]> git.donarmstrong.com Git - mothur.git/blob - getoturepcommand.cpp
get.oturep change and trim.seqs fix
[mothur.git] / getoturepcommand.cpp
1 /*
2  *  getoturepcommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/6/09.
6  *  Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
7  *
8  */
9
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
16
17
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21         return (left.name < right.name);        
22 }
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26         return (left.bin < right.bin);  
27 }
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31         return (left.size < right.size);        
32 }
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36         return (left.group < right.group);      
37 }
38 //**********************************************************************************************************************
39 GetOTURepCommand::GetOTURepCommand(){   
40         try {
41                 abort = true;
42                 //initialize outputTypes
43                 vector<string> tempOutNames;
44                 outputTypes["fasta"] = tempOutNames;
45                 outputTypes["name"] = tempOutNames;
46         }
47         catch(exception& e) {
48                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
49                 exit(1);
50         }
51 }
52 //**********************************************************************************************************************
53 vector<string> GetOTURepCommand::getValidParameters(){  
54         try {
55                 string Array[] =  {"fasta","list","label","name", "group", "weighted","sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
56                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
57                 return myArray;
58         }
59         catch(exception& e) {
60                 m->errorOut(e, "GetOTURepCommand", "getValidParameters");
61                 exit(1);
62         }
63 }
64 //**********************************************************************************************************************
65 vector<string> GetOTURepCommand::getRequiredParameters(){       
66         try {
67                 string Array[] =  {"fasta","list"};
68                 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
69                 return myArray;
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "GetOTURepCommand", "getRequiredParameters");
73                 exit(1);
74         }
75 }
76 //**********************************************************************************************************************
77 vector<string> GetOTURepCommand::getRequiredFiles(){    
78         try {
79                 vector<string> myArray;
80                 return myArray;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "GetOTURepCommand", "getRequiredFiles");
84                 exit(1);
85         }
86 }
87 //**********************************************************************************************************************
88 GetOTURepCommand::GetOTURepCommand(string option)  {
89         try{
90                 globaldata = GlobalData::getInstance();
91                 abort = false;
92                 allLines = 1;
93                 labels.clear();
94                                 
95                 //allow user to run help
96                 if (option == "help") { 
97                         help(); abort = true;
98                 } else {
99                         //valid paramters for this command
100                         string Array[] =  {"fasta","list","label","name","weighted", "group", "sorted", "phylip","column","large","cutoff","precision","groups","outputdir","inputdir"};
101                         vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
102                         
103                         OptionParser parser(option);
104                         map<string, string> parameters = parser.getParameters();
105                         
106                         ValidParameters validParameter;
107                         map<string, string>::iterator it;
108                 
109                         //check to make sure all parameters are valid for command
110                         for (it = parameters.begin(); it != parameters.end(); it++) { 
111                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
112                         }
113                         
114                         //initialize outputTypes
115                         vector<string> tempOutNames;
116                         outputTypes["fasta"] = tempOutNames;
117                         outputTypes["name"] = tempOutNames;
118                         
119                         //if the user changes the input directory command factory will send this info to us in the output parameter 
120                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
121                         if (inputDir == "not found"){   inputDir = "";          }
122                         else {
123                                 string path;
124                                 it = parameters.find("list");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
130                                 }
131                                 
132                                 it = parameters.find("fasta");
133                                 //user has given a template file
134                                 if(it != parameters.end()){ 
135                                         path = m->hasPath(it->second);
136                                         //if the user has not given a path then, add inputdir. else leave path alone.
137                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
138                                 }
139                                 
140                                 it = parameters.find("phylip");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["phylip"] = inputDir + it->second;           }
146                                 }
147                                 
148                                 it = parameters.find("column");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["column"] = inputDir + it->second;           }
154                                 }
155                                 
156                                 it = parameters.find("name");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
162                                 }
163                                 
164                                 it = parameters.find("group");
165                                 //user has given a template file
166                                 if(it != parameters.end()){ 
167                                         path = m->hasPath(it->second);
168                                         //if the user has not given a path then, add inputdir. else leave path alone.
169                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
170                                 }
171                         }
172
173                         
174                         //if the user changes the output directory command factory will send this info to us in the output parameter 
175                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
176                         
177                         //check for required parameters
178                         fastafile = validParameter.validFile(parameters, "fasta", true);
179                         if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
180                         else if (fastafile == "not open") { abort = true; }     
181                 
182                         listfile = validParameter.validFile(parameters, "list", true);
183                         if (listfile == "not found") { m->mothurOut("list is a required parameter for the get.oturep command."); m->mothurOutEndLine(); abort = true; }
184                         else if (listfile == "not open") { abort = true; }      
185                         
186                         phylipfile = validParameter.validFile(parameters, "phylip", true);
187                         if (phylipfile == "not found") { phylipfile = "";  }
188                         else if (phylipfile == "not open") { abort = true; }    
189                         else { distFile = phylipfile; format = "phylip";   }
190                         
191                         columnfile = validParameter.validFile(parameters, "column", true);
192                         if (columnfile == "not found") { columnfile = ""; }
193                         else if (columnfile == "not open") { abort = true; }    
194                         else { distFile = columnfile; format = "column";   }
195                         
196                         namefile = validParameter.validFile(parameters, "name", true);
197                         if (namefile == "not open") { abort = true; }   
198                         else if (namefile == "not found") { namefile = ""; }
199                         
200                         if ((phylipfile == "") && (columnfile == "")) { m->mothurOut("When executing a get.oturep command you must enter a phylip or a column."); m->mothurOutEndLine(); abort = true; }
201                         else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
202                 
203                         if (columnfile != "") {  if (namefile == "") {  cout << "You need to provide a namefile if you are going to use the column format." << endl; abort = true; }  }
204
205                         //check for optional parameter and set defaults
206                         // ...at some point should added some additional type checking...
207                         label = validParameter.validFile(parameters, "label", false);                   
208                         if (label == "not found") { label = ""; allLines = 1;  }
209                         else { 
210                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
211                                 else { allLines = 1;  }
212                         }
213                         
214                         groupfile = validParameter.validFile(parameters, "group", true);
215                         if (groupfile == "not open") { groupfile = ""; abort = true; }
216                         else if (groupfile == "not found") { groupfile = ""; }
217                                                 
218                         sorted = validParameter.validFile(parameters, "sorted", false);         if (sorted == "not found"){     sorted = "";    }
219                         if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
220                                 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
221                                 sorted = "";
222                         }
223                         
224                         if ((sorted == "group") && (groupfile == "")) {
225                                 m->mothurOut("You must provide a groupfile to sort by group. I will not sort."); m->mothurOutEndLine();
226                                 sorted = "";
227                         }
228                         
229                         groups = validParameter.validFile(parameters, "groups", false);                 
230                         if (groups == "not found") { groups = ""; }
231                         else { 
232                                 if (groupfile == "") {
233                                         m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
234                                         abort = true;
235                                 }else { 
236                                         m->splitAtDash(groups, Groups);
237                                 }
238                         }
239                         globaldata->Groups = Groups;
240                         
241                         string temp = validParameter.validFile(parameters, "large", false);             if (temp == "not found") {      temp = "F";     }
242                         large = m->isTrue(temp);
243                         
244                         temp = validParameter.validFile(parameters, "weighted", false);         if (temp == "not found") {      if (namefile == "") { temp = "F"; } else { temp = "t"; }        }
245                         weighted = m->isTrue(temp);
246                         
247                         if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
248                         
249                         temp = validParameter.validFile(parameters, "precision", false);                        if (temp == "not found") { temp = "100"; }
250                         convert(temp, precision); 
251                         
252                         temp = validParameter.validFile(parameters, "cutoff", false);                   if (temp == "not found") { temp = "10.0"; }
253                         convert(temp, cutoff); 
254                         cutoff += (5 / (precision * 10.0));
255                 }
256         }
257         catch(exception& e) {
258                 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
259                 exit(1);
260         }
261 }
262
263 //**********************************************************************************************************************
264
265 void GetOTURepCommand::help(){
266         try {
267                 m->mothurOut("The get.oturep command parameters are phylip, column, list, fasta, name, group, large, weighted, cutoff, precision, groups, sorted and label.  The fasta and list parameters are required, as well as phylip or column and name.\n");
268                 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n");
269                 m->mothurOut("The phylip or column parameter is required, but only one may be used.  If you use a column file the name filename is required. \n");
270                 m->mothurOut("If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n");
271                 m->mothurOut("The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
272                 m->mothurOut("Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n");
273                 m->mothurOut("The default value for label is all labels in your inputfile.\n");
274                 m->mothurOut("The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n");
275                 m->mothurOut("The large parameter allows you to indicate that your distance matrix is too large to fit in RAM.  The default value is false.\n");
276                 m->mothurOut("The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true.  The default value is false with no namesfile and true when a name file is provided.\n");
277                 m->mothurOut("The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n");
278                 m->mothurOut("For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n");
279                 m->mothurOut("For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n");
280                 m->mothurOut("If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n");
281                 m->mothurOut("The group parameter allows you provide a group file.\n");
282                 m->mothurOut("The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n");
283                 m->mothurOut("The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n");
284                 m->mothurOut("If you provide a groupfile, then it also appends the names of the groups present in that bin.\n");
285                 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
286         }
287         catch(exception& e) {
288                 m->errorOut(e, "GetOTURepCommand", "help");
289                 exit(1);
290         }
291 }
292
293 //**********************************************************************************************************************
294
295 GetOTURepCommand::~GetOTURepCommand(){}
296
297 //**********************************************************************************************************************
298
299 int GetOTURepCommand::execute(){
300         try {
301         
302                 if (abort == true) { return 0; }
303                 int error;
304                 
305                 if (!large) {
306                         //read distance files
307                         if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }        
308                         else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
309                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
310                         
311                         readMatrix->setCutoff(cutoff);
312         
313                         if(namefile != ""){     
314                                 nameMap = new NameAssignment(namefile);
315                                 nameMap->readMap();
316                         }else{  nameMap = NULL;         }
317                         
318                         readMatrix->read(nameMap);
319                         
320                         if (m->control_pressed) { delete readMatrix; return 0; }
321
322                         //get matrix
323                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
324                         globaldata->gListVector = readMatrix->getListVector();
325
326                         SparseMatrix* matrix = readMatrix->getMatrix();
327                         
328                         // Create a data structure to quickly access the distance information.
329                         // It consists of a vector of distance maps, where each map contains
330                         // all distances of a certain sequence. Vector and maps are accessed
331                         // via the index of a sequence in the distance matrix
332                         seqVec = vector<SeqMap>(globaldata->gListVector->size()); 
333                         for (MatData currentCell = matrix->begin(); currentCell != matrix->end(); currentCell++) {
334                                 if (m->control_pressed) { delete readMatrix; return 0; }
335                                 seqVec[currentCell->row][currentCell->column] = currentCell->dist;
336                         }
337                         //add dummy map for unweighted calc
338                         SeqMap dummy;
339                         seqVec.push_back(dummy);
340                         
341                         delete matrix;
342                         delete readMatrix;
343                         delete nameMap;
344                         
345                         if (m->control_pressed) { return 0; }
346                 }else {
347                         //process file and set up indexes
348                         if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }    
349                         else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
350                         else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0;  }
351                         
352                         formatMatrix->setCutoff(cutoff);
353         
354                         if(namefile != ""){     
355                                 nameMap = new NameAssignment(namefile);
356                                 nameMap->readMap();
357                         }else{  nameMap = NULL;         }
358                         
359                         formatMatrix->read(nameMap);
360                         
361                         if (m->control_pressed) { delete formatMatrix;  return 0; }
362
363                         //get matrix
364                         if (globaldata->gListVector != NULL) {  delete globaldata->gListVector;  }
365                         globaldata->gListVector = formatMatrix->getListVector();
366                         
367                         distFile = formatMatrix->getFormattedFileName();
368                         
369                         //positions in file where the distances for each sequence begin
370                         //rowPositions[1] = position in file where distance related to sequence 1 start.
371                         rowPositions = formatMatrix->getRowPositions();
372                         rowPositions.push_back(-1); //dummy row for unweighted calc
373                         
374                         delete formatMatrix;
375                         delete nameMap;
376                         
377                         //openfile for getMap to use
378                         m->openInputFile(distFile, inRow);
379                         
380                         if (m->control_pressed) { inRow.close(); remove(distFile.c_str()); return 0; }
381                 }
382                 
383                 
384                 //globaldata->gListVector bin 0 = first name read in distance matrix, globaldata->gListVector bin 1 = second name read in distance matrix
385                 if (globaldata->gListVector != NULL) {
386                         vector<string> names;
387                         string binnames;
388                         //map names to rows in sparsematrix
389                         for (int i = 0; i < globaldata->gListVector->size(); i++) {
390                                 names.clear();
391                                 binnames = globaldata->gListVector->get(i);
392                                 
393                                 m->splitAtComma(binnames, names);
394                                 
395                                 for (int j = 0; j < names.size(); j++) {
396                                         nameToIndex[names[j]] = i;
397                                 }
398                         }
399                 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
400                 
401                                 
402                 if (m->control_pressed) { 
403                         if (large) {  inRow.close(); remove(distFile.c_str());  }
404                         return 0; 
405                 }
406                 
407                 if (groupfile != "") {
408                         //read in group map info.
409                         groupMap = new GroupMap(groupfile);
410                         int error = groupMap->readMap();
411                         if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = "";  }
412                         
413                         if (Groups.size() != 0) {
414                                 SharedUtil* util = new SharedUtil();
415                                 util->setGroups(Groups, groupMap->namesOfGroups, "getoturep");
416                                 delete util;
417                         }
418                 }
419                                                                 
420                 //set format to list so input can get listvector
421                 globaldata->setFormat("list");
422         
423                 //read list file
424                 read = new ReadOTUFile(listfile);
425                 read->read(&*globaldata); 
426                 
427                 input = globaldata->ginput;
428                 list = globaldata->gListVector;
429                 string lastLabel = list->getLabel();
430
431                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
432                 set<string> processedLabels;
433                 set<string> userLabels = labels;
434                 
435                 if (m->control_pressed) { 
436                         if (large) {  inRow.close(); remove(distFile.c_str());  }
437                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
438                 }
439                 
440                 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
441                 
442                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
443                         
444                         if (allLines == 1 || labels.count(list->getLabel()) == 1){
445                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
446                                         error = process(list);
447                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
448                                         
449                                         if (m->control_pressed) { 
450                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
451                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
452                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
453                                         }
454                                         
455                                         processedLabels.insert(list->getLabel());
456                                         userLabels.erase(list->getLabel());
457                         }
458                         
459                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
460                                         string saveLabel = list->getLabel();
461                                         
462                                         delete list;
463                                         list = input->getListVector(lastLabel);
464                                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
465                                         error = process(list);
466                                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
467                                         
468                                         if (m->control_pressed) { 
469                                                 if (large) {  inRow.close(); remove(distFile.c_str());  }
470                                                 for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
471                                                 delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
472                                         }
473                                         
474                                         processedLabels.insert(list->getLabel());
475                                         userLabels.erase(list->getLabel());
476                                         
477                                         //restore real lastlabel to save below
478                                         list->setLabel(saveLabel);
479                         }
480                         
481                         lastLabel = list->getLabel();
482         
483                         delete list;
484                         list = input->getListVector();
485                 }
486                 
487                 //output error messages about any remaining user labels
488                 bool needToRun = false;
489                 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {  
490                         m->mothurOut("Your file does not include the label " + (*it)); 
491                         if (processedLabels.count(lastLabel) != 1) {
492                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
493                                 needToRun = true;
494                         }else {
495                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
496                         }
497                 }
498                 
499                 //run last label if you need to
500                 if (needToRun == true)  {
501                         if (list != NULL) {     delete list;    }
502                         list = input->getListVector(lastLabel);
503                         m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
504                         error = process(list);
505                         delete list;
506                         if (error == 1) { return 0; } //there is an error in hte input files, abort command
507                         
508                         if (m->control_pressed) { 
509                                         if (large) {  inRow.close(); remove(distFile.c_str());  }
510                                         for (int i = 0; i < outputNames.size(); i++) {  remove(outputNames[i].c_str());  } outputTypes.clear();
511                                         delete read; delete input; delete list; globaldata->gListVector = NULL; return 0; 
512                         }
513                 }
514                 
515                 //close and remove formatted matrix file
516                 if (large) {
517                         inRow.close();
518                         remove(distFile.c_str());
519                 }
520                 
521                 globaldata->gListVector = NULL;
522                 delete input;  globaldata->ginput = NULL;
523                 delete read;
524                 
525                 if (!weighted) { nameFileMap.clear(); }
526                 
527                 //read fastafile
528                 fasta = new FastaMap();
529                 fasta->readFastaFile(fastafile);
530                 
531                 //if user gave a namesfile then use it
532                 if (namefile != "") {   readNamesFile();        }
533                 
534                 //output create and output the .rep.fasta files
535                 map<string, string>::iterator itNameFile;
536                 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
537                         processNames(itNameFile->first, itNameFile->second);
538                 }
539                 
540                 delete fasta;
541                 if (groupfile != "") {
542                         delete groupMap;  globaldata->gGroupmap = NULL;
543                 }
544                 
545                 if (m->control_pressed) {  return 0; }
546                 
547                 m->mothurOutEndLine();
548                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
549                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
550                 m->mothurOutEndLine();
551                 
552                 return 0;
553         }
554         catch(exception& e) {
555                 m->errorOut(e, "GetOTURepCommand", "execute");
556                 exit(1);
557         }
558 }
559
560 //**********************************************************************************************************************
561 void GetOTURepCommand::readNamesFile() {
562         try {
563                 vector<string> dupNames;
564                 m->openInputFile(namefile, inNames);
565                 
566                 string name, names, sequence;
567         
568                 while(!inNames.eof()){
569                         inNames >> name;                        //read from first column  A
570                         inNames >> names;               //read from second column  A,B,C,D
571                         
572                         dupNames.clear();
573                         
574                         //parse names into vector
575                         m->splitAtComma(names, dupNames);
576                         
577                         //store names in fasta map
578                         sequence = fasta->getSequence(name);
579                         for (int i = 0; i < dupNames.size(); i++) {
580                                 fasta->push_back(dupNames[i], sequence);
581                         }
582                 
583                         m->gobble(inNames);
584                 }
585                 inNames.close();
586
587         }
588         catch(exception& e) {
589                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
590                 exit(1);
591         }
592 }
593 //**********************************************************************************************************************
594 //read names file to find the weighted rep for each bin
595 void GetOTURepCommand::readNamesFile(bool w) {
596         try {
597                 vector<string> dupNames;
598                 m->openInputFile(namefile, inNames);
599                 
600                 string name, names, sequence;
601                 
602                 while(!inNames.eof()){
603                         inNames >> name;        m->gobble(inNames);             //read from first column  A
604                         inNames >> names;                                                       //read from second column  A,B,C,D
605                         
606                         dupNames.clear();
607                         
608                         //parse names into vector
609                         m->splitAtComma(names, dupNames);
610                         
611                         for (int i = 0; i < dupNames.size(); i++) {
612                                 nameFileMap[dupNames[i]] = name;
613                         }
614                         
615                         m->gobble(inNames);
616                 }
617                 inNames.close();
618                 
619         }
620         catch(exception& e) {
621                 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
622                 exit(1);
623         }
624 }
625 //**********************************************************************************************************************
626 string GetOTURepCommand::findRep(vector<string> names) {
627         try{
628                 // if only 1 sequence in bin or processing the "unique" label, then 
629                 // the first sequence of the OTU is the representative one
630                 if ((names.size() == 1) || (list->getLabel() == "unique")) {
631                         return names[0];
632                 }else{
633                         vector<int> seqIndex(names.size());
634                         vector<float> max_dist(names.size());
635                         vector<float> total_dist(names.size());
636                         map<string, string>::iterator itNameFile;
637                         map<string, int>::iterator itNameIndex;
638
639                         //fill seqIndex and initialize sums
640                         for (size_t i = 0; i < names.size(); i++) {
641                                 if (weighted) {
642                                         seqIndex[i] = nameToIndex[names[i]];
643                                 }else { 
644                                         if (namefile == "") {
645                                                 itNameIndex = nameToIndex.find(names[i]);
646                                                 
647                                                 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
648                                                         if (large) {  seqIndex[i] = (rowPositions.size()-1); }
649                                                         else {  seqIndex[i] = (seqVec.size()-1); }
650                                                 }else {
651                                                         seqIndex[i] = itNameIndex->second;
652                                                 }
653                                                 
654                                         }else {
655                                                 itNameFile = nameFileMap.find(names[i]);
656                                                 
657                                                 if (itNameFile == nameFileMap.end()) {
658                                                         m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; 
659                                                 }else{
660                                                         string name1 = itNameFile->first;
661                                                         string name2 = itNameFile->second;
662                                                         
663                                                         if (name1 == name2) { //then you are unique so add your real dists
664                                                                 seqIndex[i] = nameToIndex[names[i]];
665                                                         }else { //add dummy
666                                                                 if (large) {  seqIndex[i] = (rowPositions.size()-1); }
667                                                                 else {  seqIndex[i] = (seqVec.size()-1); }
668                                                         }
669                                                 }
670                                         }
671                                 }
672                                 max_dist[i] = 0.0;
673                                 total_dist[i] = 0.0;
674                         }
675                         
676                         // loop through all entries in seqIndex
677                         SeqMap::iterator it;
678                         SeqMap currMap;
679                         for (size_t i=0; i < seqIndex.size(); i++) {
680                                 if (m->control_pressed) {  return  "control"; }
681                         
682                                 if (!large) {   currMap = seqVec[seqIndex[i]];  }
683                                 else            {       currMap = getMap(seqIndex[i]);  }
684                                 
685                                 for (size_t j=0; j < seqIndex.size(); j++) {
686                                         it = currMap.find(seqIndex[j]);         
687                                         if (it != currMap.end()) {
688                                                 max_dist[i] = max(max_dist[i], it->second);
689                                                 max_dist[j] = max(max_dist[j], it->second);
690                                                 total_dist[i] += it->second;
691                                                 total_dist[j] += it->second;
692                                         }else{ //if you can't find the distance make it the cutoff
693                                                 max_dist[i] = max(max_dist[i], cutoff);
694                                                 max_dist[j] = max(max_dist[j], cutoff);
695                                                 total_dist[i] += cutoff;
696                                                 total_dist[j] += cutoff;
697                                         }
698                                 }
699                         }
700                         
701                         // sequence with the smallest maximum distance is the representative
702                         //if tie occurs pick sequence with smallest average distance
703                         float min = 10000;
704                         int minIndex;
705                         for (size_t i=0; i < max_dist.size(); i++) {
706                                 if (m->control_pressed) {  return  "control"; }
707                                 if (max_dist[i] < min) {
708                                         min = max_dist[i];
709                                         minIndex = i;
710                                 }else if (max_dist[i] == min) {
711                                         float currentAverage = total_dist[minIndex] / (float) total_dist.size();
712                                         float newAverage = total_dist[i] / (float) total_dist.size();
713                                         
714                                         if (newAverage < currentAverage) {
715                                                 min = max_dist[i];
716                                                 minIndex = i;
717                                         }
718                                 }
719                         }
720                         
721                         return(names[minIndex]);
722                 }
723         }
724         catch(exception& e) {
725                 m->errorOut(e, "GetOTURepCommand", "FindRep");
726                 exit(1);
727         }
728 }
729
730 //**********************************************************************************************************************
731 int GetOTURepCommand::process(ListVector* processList) {
732         try{
733                 string name, sequence;
734                 string nameRep;
735
736                 //create output file
737                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
738                                 
739                 ofstream newNamesOutput;
740                 string outputNamesFile;
741                 map<string, ofstream*> filehandles;
742                 
743                 if (Groups.size() == 0) { //you don't want to use groups
744                         outputNamesFile  = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".rep.names";
745                         m->openOutputFile(outputNamesFile, newNamesOutput);
746                         outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile); 
747                         outputNameFiles[outputNamesFile] = processList->getLabel();
748                 }else{ //you want to use groups
749                         ofstream* temp;
750                         for (int i=0; i<Groups.size(); i++) {
751                                 temp = new ofstream;
752                                 filehandles[Groups[i]] = temp;
753                                 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".rep.names";
754                                 
755                                 m->openOutputFile(outputNamesFile, *(temp));
756                                 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
757                                 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
758                         }
759                 }
760                 
761                 //for each bin in the list vector
762                 for (int i = 0; i < processList->size(); i++) {
763                         if (m->control_pressed) { 
764                                 out.close();  
765                                 if (Groups.size() == 0) { //you don't want to use groups
766                                         newNamesOutput.close();
767                                 }else{
768                                         for (int j=0; j<Groups.size(); j++) {
769                                                 (*(filehandles[Groups[j]])).close();
770                                                 delete filehandles[Groups[j]];
771                                         }
772                                 }
773                                 return 0; 
774                         }
775                         
776                         string temp = processList->get(i);
777                         vector<string> namesInBin;
778                         m->splitAtComma(temp, namesInBin);
779                         
780                         if (Groups.size() == 0) {
781                                 nameRep = findRep(namesInBin);
782                                 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
783                         }else{
784                                 map<string, vector<string> > NamesInGroup;
785                                 for (int j=0; j<Groups.size(); j++) { //initialize groups
786                                         NamesInGroup[Groups[j]].resize(0);
787                                 }
788                                 
789                                 for (int j=0; j<namesInBin.size(); j++) {
790                                         string thisgroup = groupMap->getGroup(namesInBin[j]);
791                                         
792                                         if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
793                                         
794                                         if (m->inUsersGroups(thisgroup, Groups)) { //add this name to correct group
795                                                 NamesInGroup[thisgroup].push_back(namesInBin[j]);
796                                         }
797                                 }
798                                 
799                                 //get rep for each group in otu
800                                 for (int j=0; j<Groups.size(); j++) {
801                                         if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
802                                                 //get rep for each group
803                                                 nameRep = findRep(NamesInGroup[Groups[j]]);
804                                                 
805                                                 //output group rep and other members of this group
806                                                 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
807                                                 
808                                                 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
809                                                         (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
810                                                 }
811                                                 //output last name
812                                                 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
813                                         }
814                                 }
815                         }
816                 }
817                 
818                 if (Groups.size() == 0) { //you don't want to use groups
819                         newNamesOutput.close();
820                 }else{
821                         for (int i=0; i<Groups.size(); i++) {
822                                 (*(filehandles[Groups[i]])).close();
823                                 delete filehandles[Groups[i]];
824                         }
825                 }
826                 
827                 return 0;
828
829         }
830         catch(exception& e) {
831                 m->errorOut(e, "GetOTURepCommand", "process");
832                 exit(1);
833         }
834 }
835 //**********************************************************************************************************************
836 int GetOTURepCommand::processNames(string filename, string label) {
837         try{
838
839                 //create output file
840                 if (outputDir == "") { outputDir += m->hasPath(listfile); }
841                 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + ".rep.fasta";
842                 m->openOutputFile(outputFileName, out);
843                 vector<repStruct> reps;
844                 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
845                 
846                 ofstream out2;
847                 string tempNameFile = filename + ".temp";
848                 m->openOutputFile(tempNameFile, out2);
849                 
850                 ifstream in;
851                 m->openInputFile(filename, in);
852                 
853                 int i = 0;
854                 while (!in.eof()) {
855                         string rep, binnames;
856                         in >> i >> rep >> binnames; m->gobble(in);
857                         out2 << rep << '\t' << binnames << endl;
858                         
859                         vector<string> names;
860                         m->splitAtComma(binnames, names);
861                         int binsize = names.size();
862                         
863                         //if you have a groupfile
864                         string group = "";
865                         if (groupfile != "") {
866                                 map<string, string> groups;
867                                 map<string, string>::iterator groupIt;
868                                 
869                                 //find the groups that are in this bin
870                                 for (size_t i = 0; i < names.size(); i++) {
871                                         string groupName = groupMap->getGroup(names[i]);
872                                         if (groupName == "not found") {  
873                                                 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
874                                                 groupError = true;
875                                         } else {
876                                                 groups[groupName] = groupName;
877                                         }
878                                 }
879                                 
880                                 //turn the groups into a string
881                                 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
882                                         group += groupIt->first + "-";
883                                 }
884                                 //rip off last dash
885                                 group = group.substr(0, group.length()-1);
886                         }else{ group = ""; }
887
888                         
889                         //print out name and sequence for that bin
890                         string sequence = fasta->getSequence(rep);
891
892                         if (sequence != "not found") {
893                                 if (sorted == "") { //print them out
894                                         rep = rep + "\t" + toString(i+1);
895                                         rep = rep + "|" + toString(binsize);
896                                         if (groupfile != "") {
897                                                 rep = rep + "|" + group;
898                                         }
899                                         out << ">" << rep << endl;
900                                         out << sequence << endl;
901                                 }else { //save them
902                                         repStruct newRep(rep, i+1, binsize, group);
903                                         reps.push_back(newRep);
904                                 }
905                         }else { 
906                                 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine(); 
907                         }
908                 }
909                 
910                         
911                 if (sorted != "") { //then sort them and print them
912                         if (sorted == "name")           {  sort(reps.begin(), reps.end(), compareName);         }
913                         else if (sorted == "bin")       {  sort(reps.begin(), reps.end(), compareBin);          }
914                         else if (sorted == "size")      {  sort(reps.begin(), reps.end(), compareSize);         }
915                         else if (sorted == "group")     {  sort(reps.begin(), reps.end(), compareGroup);        }
916                         
917                         //print them
918                         for (int i = 0; i < reps.size(); i++) {
919                                 string sequence = fasta->getSequence(reps[i].name);
920                                 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
921                                 outputName = outputName + "|" + toString(reps[i].size);
922                                 if (groupfile != "") {
923                                         outputName = outputName + "|" + reps[i].group;
924                                 }
925                                 out << ">" << outputName << endl;
926                                 out << sequence << endl;
927                         }
928                 }
929                 
930                 in.close();
931                 out.close();
932                 out2.close();
933                 
934                 remove(filename.c_str());
935                 rename(tempNameFile.c_str(), filename.c_str());
936                 
937                 return 0;
938
939         }
940         catch(exception& e) {
941                 m->errorOut(e, "GetOTURepCommand", "processNames");
942                 exit(1);
943         }
944 }
945 //**********************************************************************************************************************
946 SeqMap GetOTURepCommand::getMap(int row) {
947         try {
948                 SeqMap rowMap;
949                 
950                 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
951                 if (rowPositions[row] != -1){
952                         //go to row in file
953                         inRow.seekg(rowPositions[row]);
954                         
955                         int rowNum, numDists, colNum;
956                         float dist;
957                         
958                         inRow >> rowNum >> numDists;
959                         
960                         for(int i = 0; i < numDists; i++) {
961                                 inRow >> colNum >> dist;
962                                 rowMap[colNum] = dist;
963                                 
964                         }
965                 }
966                 
967                 return rowMap;
968         }
969         catch(exception& e) {
970                 m->errorOut(e, "GetOTURepCommand", "getMap");
971                 exit(1);
972         }
973 }
974 //**********************************************************************************************************************
975