5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
45 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
46 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName",false,false); parameters.push_back(pcount);
47 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
48 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
49 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
50 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
51 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
52 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
53 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
54 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
55 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
56 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
57 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
59 vector<string> myArray;
60 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
64 m->errorOut(e, "GetOTURepCommand", "setParameters");
68 //**********************************************************************************************************************
69 string GetOTURepCommand::getHelpString(){
71 string helpString = "";
72 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
73 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
74 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name or count filename is required. \n";
75 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
76 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
77 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
78 helpString += "The default value for label is all labels in your inputfile.\n";
79 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
80 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
81 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
82 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
83 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
84 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
85 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
86 helpString += "The group parameter allows you provide a group file.\n";
87 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
88 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
89 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
90 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
94 m->errorOut(e, "GetOTURepCommand", "getHelpString");
98 //**********************************************************************************************************************
99 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){
101 string outputFileName = "";
102 map<string, vector<string> >::iterator it;
104 //is this a type this command creates
105 it = outputTypes.find(type);
106 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
108 if (type == "fasta") { outputFileName = "rep.fasta"; }
109 else if (type == "name") { outputFileName = "rep.names"; }
110 else if (type == "count") { outputFileName = "rep.count_table"; }
111 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
113 return outputFileName;
115 catch(exception& e) {
116 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
120 //**********************************************************************************************************************
121 GetOTURepCommand::GetOTURepCommand(){
123 abort = true; calledHelp = true;
125 vector<string> tempOutNames;
126 outputTypes["fasta"] = tempOutNames;
127 outputTypes["name"] = tempOutNames;
128 outputTypes["count"] = tempOutNames;
130 catch(exception& e) {
131 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
135 //**********************************************************************************************************************
136 GetOTURepCommand::GetOTURepCommand(string option) {
138 abort = false; calledHelp = false;
141 //allow user to run help
142 if (option == "help") {
143 help(); abort = true; calledHelp = true;
144 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
146 vector<string> myArray = setParameters();
148 OptionParser parser(option);
149 map<string, string> parameters = parser.getParameters();
151 ValidParameters validParameter;
152 map<string, string>::iterator it;
154 //check to make sure all parameters are valid for command
155 for (it = parameters.begin(); it != parameters.end(); it++) {
156 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
159 //initialize outputTypes
160 vector<string> tempOutNames;
161 outputTypes["fasta"] = tempOutNames;
162 outputTypes["name"] = tempOutNames;
163 outputTypes["count"] = tempOutNames;
165 //if the user changes the input directory command factory will send this info to us in the output parameter
166 string inputDir = validParameter.validFile(parameters, "inputdir", false);
167 if (inputDir == "not found"){ inputDir = ""; }
170 it = parameters.find("list");
171 //user has given a template file
172 if(it != parameters.end()){
173 path = m->hasPath(it->second);
174 //if the user has not given a path then, add inputdir. else leave path alone.
175 if (path == "") { parameters["list"] = inputDir + it->second; }
178 it = parameters.find("fasta");
179 //user has given a template file
180 if(it != parameters.end()){
181 path = m->hasPath(it->second);
182 //if the user has not given a path then, add inputdir. else leave path alone.
183 if (path == "") { parameters["fasta"] = inputDir + it->second; }
186 it = parameters.find("phylip");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["phylip"] = inputDir + it->second; }
194 it = parameters.find("column");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["column"] = inputDir + it->second; }
202 it = parameters.find("name");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["name"] = inputDir + it->second; }
210 it = parameters.find("group");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["group"] = inputDir + it->second; }
218 it = parameters.find("count");
219 //user has given a template file
220 if(it != parameters.end()){
221 path = m->hasPath(it->second);
222 //if the user has not given a path then, add inputdir. else leave path alone.
223 if (path == "") { parameters["count"] = inputDir + it->second; }
228 //if the user changes the output directory command factory will send this info to us in the output parameter
229 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
231 //check for required parameters
232 fastafile = validParameter.validFile(parameters, "fasta", true);
233 if (fastafile == "not found") { fastafile = ""; }
234 else if (fastafile == "not open") { abort = true; }
235 else { m->setFastaFile(fastafile); }
237 listfile = validParameter.validFile(parameters, "list", true);
238 if (listfile == "not found") {
239 listfile = m->getListFile();
240 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
241 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
243 else if (listfile == "not open") { abort = true; }
244 else { m->setListFile(listfile); }
246 phylipfile = validParameter.validFile(parameters, "phylip", true);
247 if (phylipfile == "not found") { phylipfile = ""; }
248 else if (phylipfile == "not open") { abort = true; }
249 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
251 columnfile = validParameter.validFile(parameters, "column", true);
252 if (columnfile == "not found") { columnfile = ""; }
253 else if (columnfile == "not open") { abort = true; }
254 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
256 namefile = validParameter.validFile(parameters, "name", true);
257 if (namefile == "not open") { abort = true; }
258 else if (namefile == "not found") { namefile = ""; }
259 else { m->setNameFile(namefile); }
262 countfile = validParameter.validFile(parameters, "count", true);
263 if (countfile == "not found") { countfile = ""; }
264 else if (countfile == "not open") { abort = true; countfile = ""; }
266 m->setCountTableFile(countfile);
267 ct.readTable(countfile);
268 if (ct.hasGroupInfo()) { hasGroups = true; }
271 if ((namefile != "") && (countfile != "")) {
272 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
275 if ((groupfile != "") && (countfile != "")) {
276 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
279 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
280 //give priority to column, then phylip
281 columnfile = m->getColumnFile();
282 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
284 phylipfile = m->getPhylipFile();
285 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
287 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
291 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
293 if (columnfile != "") {
294 if ((namefile == "") && (countfile == "")) {
295 namefile = m->getNameFile();
296 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
298 countfile = m->getCountTableFile();
299 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
301 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
308 //check for optional parameter and set defaults
309 // ...at some point should added some additional type checking...
310 label = validParameter.validFile(parameters, "label", false);
311 if (label == "not found") { label = ""; allLines = 1; }
313 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
314 else { allLines = 1; }
317 groupfile = validParameter.validFile(parameters, "group", true);
318 if (groupfile == "not open") { groupfile = ""; abort = true; }
319 else if (groupfile == "not found") { groupfile = ""; }
320 else { m->setGroupFile(groupfile); }
322 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
323 if (sorted == "none") { sorted=""; }
324 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
325 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
329 if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) {
330 m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine();
334 groups = validParameter.validFile(parameters, "groups", false);
335 if (groups == "not found") { groups = ""; }
337 if ((groupfile == "") && (!hasGroups)) {
338 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
341 m->splitAtDash(groups, Groups);
344 m->setGroups(Groups);
346 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
347 large = m->isTrue(temp);
349 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
350 weighted = m->isTrue(temp);
352 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
354 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
355 m->mothurConvert(temp, precision);
357 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
358 m->mothurConvert(temp, cutoff);
359 cutoff += (5 / (precision * 10.0));
362 catch(exception& e) {
363 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
368 //**********************************************************************************************************************
370 int GetOTURepCommand::execute(){
373 if (abort == true) { if (calledHelp) { return 0; } return 2; }
379 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); } return 0; }
381 if (groupfile != "") {
382 //read in group map info.
383 groupMap = new GroupMap(groupfile);
384 int error = groupMap->readMap();
385 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
387 if (Groups.size() != 0) {
389 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
390 util.setGroups(Groups, gNamesOfGroups, "getoturep");
391 groupMap->setNamesOfGroups(gNamesOfGroups);
393 }else if (hasGroups) {
394 if (Groups.size() != 0) {
396 vector<string> gNamesOfGroups = ct.getNamesOfGroups();
397 util.setGroups(Groups, gNamesOfGroups, "getoturep");
401 //done with listvector from matrix
402 if (list != NULL) { delete list; }
404 input = new InputData(listfile, "list");
405 list = input->getListVector();
406 string lastLabel = list->getLabel();
408 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
409 set<string> processedLabels;
410 set<string> userLabels = labels;
412 if (m->control_pressed) {
413 if (large) { inRow.close(); m->mothurRemove(distFile); }
414 delete input; delete list; return 0;
417 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
419 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
421 if (allLines == 1 || labels.count(list->getLabel()) == 1){
422 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
423 error = process(list);
424 if (error == 1) { return 0; } //there is an error in hte input files, abort command
426 if (m->control_pressed) {
427 if (large) { inRow.close(); m->mothurRemove(distFile); }
428 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
429 delete input; delete list; return 0;
432 processedLabels.insert(list->getLabel());
433 userLabels.erase(list->getLabel());
436 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
437 string saveLabel = list->getLabel();
440 list = input->getListVector(lastLabel);
441 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
442 error = process(list);
443 if (error == 1) { return 0; } //there is an error in hte input files, abort command
445 if (m->control_pressed) {
446 if (large) { inRow.close(); m->mothurRemove(distFile); }
447 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
448 delete input; delete list; return 0;
451 processedLabels.insert(list->getLabel());
452 userLabels.erase(list->getLabel());
454 //restore real lastlabel to save below
455 list->setLabel(saveLabel);
458 lastLabel = list->getLabel();
461 list = input->getListVector();
464 //output error messages about any remaining user labels
465 bool needToRun = false;
466 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
467 m->mothurOut("Your file does not include the label " + (*it));
468 if (processedLabels.count(lastLabel) != 1) {
469 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
472 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
476 //run last label if you need to
477 if (needToRun == true) {
478 if (list != NULL) { delete list; }
479 list = input->getListVector(lastLabel);
480 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
481 error = process(list);
483 if (error == 1) { return 0; } //there is an error in hte input files, abort command
485 if (m->control_pressed) {
486 if (large) { inRow.close(); m->mothurRemove(distFile); }
487 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
488 delete input; delete list; return 0;
492 //close and remove formatted matrix file
495 m->mothurRemove(distFile);
500 if (!weighted) { nameFileMap.clear(); }
503 if (fastafile != "") {
505 fasta = new FastaMap();
506 fasta->readFastaFile(fastafile);
508 //if user gave a namesfile then use it
509 if (namefile != "") { readNamesFile(); }
511 //output create and output the .rep.fasta files
512 map<string, string>::iterator itNameFile;
513 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
514 processFastaNames(itNameFile->first, itNameFile->second);
517 //output create and output the .rep.fasta files
518 map<string, string>::iterator itNameFile;
519 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
520 processNames(itNameFile->first, itNameFile->second);
525 if (groupfile != "") { delete groupMap; }
527 if (m->control_pressed) { return 0; }
529 //set fasta file as new current fastafile - use first one??
531 itTypes = outputTypes.find("fasta");
532 if (itTypes != outputTypes.end()) {
533 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
536 itTypes = outputTypes.find("name");
537 if (itTypes != outputTypes.end()) {
538 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
541 itTypes = outputTypes.find("count");
542 if (itTypes != outputTypes.end()) {
543 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
546 m->mothurOutEndLine();
547 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
548 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
549 m->mothurOutEndLine();
553 catch(exception& e) {
554 m->errorOut(e, "GetOTURepCommand", "execute");
558 //**********************************************************************************************************************
559 int GetOTURepCommand::readDist() {
563 //read distance files
564 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
565 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
566 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
568 readMatrix->setCutoff(cutoff);
570 NameAssignment* nameMap = NULL;
572 nameMap = new NameAssignment(namefile);
574 readMatrix->read(nameMap);
575 }else if (countfile != "") {
576 readMatrix->read(&ct);
579 if (m->control_pressed) { delete readMatrix; return 0; }
581 list = readMatrix->getListVector();
582 SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
584 // Create a data structure to quickly access the distance information.
585 // It consists of a vector of distance maps, where each map contains
586 // all distances of a certain sequence. Vector and maps are accessed
587 // via the index of a sequence in the distance matrix
588 seqVec = vector<SeqMap>(list->size());
589 for (int i = 0; i < matrix->seqVec.size(); i++) {
590 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
591 if (m->control_pressed) { delete readMatrix; return 0; }
592 //already added everyone else in row
593 if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
596 //add dummy map for unweighted calc
598 seqVec.push_back(dummy);
604 if (m->control_pressed) { return 0; }
606 //process file and set up indexes
607 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
608 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
609 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
611 formatMatrix->setCutoff(cutoff);
613 NameAssignment* nameMap = NULL;
615 nameMap = new NameAssignment(namefile);
617 readMatrix->read(nameMap);
618 }else if (countfile != "") {
619 readMatrix->read(&ct);
622 if (m->control_pressed) { delete formatMatrix; return 0; }
624 list = formatMatrix->getListVector();
625 distFile = formatMatrix->getFormattedFileName();
627 //positions in file where the distances for each sequence begin
628 //rowPositions[1] = position in file where distance related to sequence 1 start.
629 rowPositions = formatMatrix->getRowPositions();
630 rowPositions.push_back(-1); //dummy row for unweighted calc
635 //openfile for getMap to use
636 m->openInputFile(distFile, inRow);
638 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
642 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
644 vector<string> names;
646 //map names to rows in sparsematrix
647 for (int i = 0; i < list->size(); i++) {
649 binnames = list->get(i);
651 m->splitAtComma(binnames, names);
653 for (int j = 0; j < names.size(); j++) {
654 nameToIndex[names[j]] = i;
657 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
659 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; }
663 catch(exception& e) {
664 m->errorOut(e, "GetOTURepCommand", "execute");
668 //**********************************************************************************************************************
669 void GetOTURepCommand::readNamesFile() {
672 vector<string> dupNames;
673 m->openInputFile(namefile, in);
675 string name, names, sequence;
678 in >> name; //read from first column A
679 in >> names; //read from second column A,B,C,D
683 //parse names into vector
684 m->splitAtComma(names, dupNames);
686 //store names in fasta map
687 sequence = fasta->getSequence(name);
688 for (int i = 0; i < dupNames.size(); i++) {
689 fasta->push_back(dupNames[i], sequence);
697 catch(exception& e) {
698 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
702 //**********************************************************************************************************************
703 //read names file to find the weighted rep for each bin
704 void GetOTURepCommand::readNamesFile(bool w) {
707 vector<string> dupNames;
708 m->openInputFile(namefile, in);
710 string name, names, sequence;
713 in >> name; m->gobble(in); //read from first column A
714 in >> names; //read from second column A,B,C,D
718 //parse names into vector
719 m->splitAtComma(names, dupNames);
721 for (int i = 0; i < dupNames.size(); i++) {
722 nameFileMap[dupNames[i]] = name;
730 catch(exception& e) {
731 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
735 //**********************************************************************************************************************
736 string GetOTURepCommand::findRep(vector<string> names, string group) {
738 // if only 1 sequence in bin or processing the "unique" label, then
739 // the first sequence of the OTU is the representative one
740 if ((names.size() == 1)) {
743 vector<int> seqIndex; //(names.size());
744 map<string, string>::iterator itNameFile;
745 map<string, int>::iterator itNameIndex;
747 //fill seqIndex and initialize sums
748 for (size_t i = 0; i < names.size(); i++) {
750 seqIndex.push_back(nameToIndex[names[i]]);
751 if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
753 if (group != "") { numRep = ct.getGroupCount(names[i], group); }
754 else { numRep = ct.getGroupCount(names[i]); }
755 for (int j = 1; j < numRep; j++) { //don't add yourself again
756 seqIndex.push_back(nameToIndex[names[i]]);
760 if (namefile == "") {
761 itNameIndex = nameToIndex.find(names[i]);
763 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
764 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
765 else { seqIndex.push_back((seqVec.size()-1)); }
767 seqIndex.push_back(itNameIndex->second);
771 itNameFile = nameFileMap.find(names[i]);
773 if (itNameFile == nameFileMap.end()) {
774 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
776 string name1 = itNameFile->first;
777 string name2 = itNameFile->second;
779 if (name1 == name2) { //then you are unique so add your real dists
780 seqIndex.push_back(nameToIndex[names[i]]);
782 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
783 else { seqIndex.push_back((seqVec.size()-1)); }
790 vector<float> max_dist(seqIndex.size(), 0.0);
791 vector<float> total_dist(seqIndex.size(), 0.0);
793 // loop through all entries in seqIndex
796 for (size_t i=0; i < seqIndex.size(); i++) {
797 if (m->control_pressed) { return "control"; }
799 if (!large) { currMap = seqVec[seqIndex[i]]; }
800 else { currMap = getMap(seqIndex[i]); }
802 for (size_t j=0; j < seqIndex.size(); j++) {
803 it = currMap.find(seqIndex[j]);
804 if (it != currMap.end()) {
805 max_dist[i] = max(max_dist[i], it->second);
806 max_dist[j] = max(max_dist[j], it->second);
807 total_dist[i] += it->second;
808 total_dist[j] += it->second;
809 }else{ //if you can't find the distance make it the cutoff
810 max_dist[i] = max(max_dist[i], cutoff);
811 max_dist[j] = max(max_dist[j], cutoff);
812 total_dist[i] += cutoff;
813 total_dist[j] += cutoff;
818 // sequence with the smallest maximum distance is the representative
819 //if tie occurs pick sequence with smallest average distance
822 for (size_t i=0; i < max_dist.size(); i++) {
823 if (m->control_pressed) { return "control"; }
824 if (max_dist[i] < min) {
827 }else if (max_dist[i] == min) {
828 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
829 float newAverage = total_dist[i] / (float) total_dist.size();
831 if (newAverage < currentAverage) {
838 return(names[minIndex]);
841 catch(exception& e) {
842 m->errorOut(e, "GetOTURepCommand", "FindRep");
847 //**********************************************************************************************************************
848 int GetOTURepCommand::process(ListVector* processList) {
850 string name, sequence;
854 if (outputDir == "") { outputDir += m->hasPath(listfile); }
856 ofstream newNamesOutput;
857 string outputNamesFile;
858 map<string, ofstream*> filehandles;
860 if (Groups.size() == 0) { //you don't want to use groups
861 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".";
862 if (countfile == "") {
863 outputNamesFile += getOutputFileNameTag("name");
864 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
866 outputNamesFile += getOutputFileNameTag("count");
867 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
869 outputNameFiles[outputNamesFile] = processList->getLabel();
870 m->openOutputFile(outputNamesFile, newNamesOutput);
871 newNamesOutput << "noGroup" << endl;
872 }else{ //you want to use groups
874 for (int i=0; i<Groups.size(); i++) {
876 filehandles[Groups[i]] = temp;
877 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
878 if (countfile == "") {
879 outputNamesFile += getOutputFileNameTag("name");
880 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
882 outputNamesFile += getOutputFileNameTag("count");
883 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
886 m->openOutputFile(outputNamesFile, *(temp));
887 *(temp) << Groups[i] << endl;
888 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
892 //for each bin in the list vector
893 for (int i = 0; i < processList->size(); i++) {
894 if (m->control_pressed) {
896 if (Groups.size() == 0) { //you don't want to use groups
897 newNamesOutput.close();
899 for (int j=0; j<Groups.size(); j++) {
900 (*(filehandles[Groups[j]])).close();
901 delete filehandles[Groups[j]];
907 string temp = processList->get(i);
908 vector<string> namesInBin;
909 m->splitAtComma(temp, namesInBin);
911 if (Groups.size() == 0) {
912 nameRep = findRep(namesInBin, "");
913 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
915 map<string, vector<string> > NamesInGroup;
916 for (int j=0; j<Groups.size(); j++) { //initialize groups
917 NamesInGroup[Groups[j]].resize(0);
920 for (int j=0; j<namesInBin.size(); j++) {
921 if (groupfile != "") {
922 string thisgroup = groupMap->getGroup(namesInBin[j]);
923 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
925 //add this name to correct group
926 if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); }
928 vector<string> thisSeqsGroups = ct.getGroups(namesInBin[j]);
929 for (int k = 0; k < thisSeqsGroups.size(); k++) {
930 if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); }
935 //get rep for each group in otu
936 for (int j=0; j<Groups.size(); j++) {
937 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
938 //get rep for each group
939 nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]);
941 //output group rep and other members of this group
942 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
944 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
945 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
948 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
954 if (Groups.size() == 0) { //you don't want to use groups
955 newNamesOutput.close();
957 for (int i=0; i<Groups.size(); i++) {
958 (*(filehandles[Groups[i]])).close();
959 delete filehandles[Groups[i]];
966 catch(exception& e) {
967 m->errorOut(e, "GetOTURepCommand", "process");
971 //**********************************************************************************************************************
972 int GetOTURepCommand::processFastaNames(string filename, string label) {
976 if (outputDir == "") { outputDir += m->hasPath(listfile); }
977 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
978 m->openOutputFile(outputFileName, out);
979 vector<repStruct> reps;
980 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
983 string tempNameFile = filename + ".temp";
984 m->openOutputFile(tempNameFile, out2);
987 m->openInputFile(filename, in);
990 string tempGroup = "";
991 in >> tempGroup; m->gobble(in);
994 if (countfile != "") {
995 thisCt.readTable(countfile);
996 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1001 string rep, binnames;
1002 in >> i >> rep >> binnames; m->gobble(in);
1004 vector<string> names;
1005 m->splitAtComma(binnames, names);
1006 int binsize = names.size();
1008 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1010 if (tempGroup == "noGroup") {
1011 for (int j = 0; j < names.size(); j++) {
1012 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1014 binsize = thisCt.getNumSeqs(rep);
1017 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1018 out2 << rep << '\t' << total << '\t' << total << endl;
1022 thistotal += binsize;
1023 //if you have a groupfile
1025 map<string, string> groups;
1026 map<string, string>::iterator groupIt;
1027 if (groupfile != "") {
1028 //find the groups that are in this bin
1029 for (int i = 0; i < names.size(); i++) {
1030 string groupName = groupMap->getGroup(names[i]);
1031 if (groupName == "not found") {
1032 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
1035 groups[groupName] = groupName;
1039 //turn the groups into a string
1040 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1041 group += groupIt->first + "-";
1044 group = group.substr(0, group.length()-1);
1045 }else if (hasGroups) {
1046 map<string, string> groups;
1047 for (int i = 0; i < names.size(); i++) {
1048 vector<string> thisSeqsGroups = ct.getGroups(names[i]);
1049 for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; }
1051 //turn the groups into a string
1052 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1053 group += groupIt->first + "-";
1056 group = group.substr(0, group.length()-1);
1057 //cout << group << endl;
1062 //print out name and sequence for that bin
1063 string sequence = fasta->getSequence(rep);
1065 if (sequence != "not found") {
1066 if (sorted == "") { //print them out
1067 rep = rep + "\t" + toString(i+1);
1068 rep = rep + "|" + toString(binsize);
1070 rep = rep + "|" + group;
1072 out << ">" << rep << endl;
1073 out << sequence << endl;
1075 repStruct newRep(rep, i+1, binsize, group);
1076 reps.push_back(newRep);
1079 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
1084 if (sorted != "") { //then sort them and print them
1085 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
1086 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
1087 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
1088 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
1091 for (int i = 0; i < reps.size(); i++) {
1092 string sequence = fasta->getSequence(reps[i].name);
1093 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
1094 outputName = outputName + "|" + toString(reps[i].size);
1095 if (reps[i].group != "") {
1096 outputName = outputName + "|" + reps[i].group;
1098 out << ">" << outputName << endl;
1099 out << sequence << endl;
1107 m->mothurRemove(filename);
1108 rename(tempNameFile.c_str(), filename.c_str());
1110 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1115 catch(exception& e) {
1116 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
1120 //**********************************************************************************************************************
1121 int GetOTURepCommand::processNames(string filename, string label) {
1124 //create output file
1125 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1128 string tempNameFile = filename + ".temp";
1129 m->openOutputFile(tempNameFile, out2);
1132 m->openInputFile(filename, in);
1135 string rep, binnames;
1137 string tempGroup = "";
1138 in >> tempGroup; m->gobble(in);
1141 if (countfile != "") {
1142 thisCt.readTable(countfile);
1143 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1147 if (m->control_pressed) { break; }
1148 in >> i >> rep >> binnames; m->gobble(in);
1150 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1152 vector<string> names;
1153 m->splitAtComma(binnames, names);
1154 if (tempGroup == "noGroup") {
1155 for (int j = 0; j < names.size(); j++) {
1156 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1160 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1161 out2 << rep << '\t' << total << '\t' << total << endl;
1169 m->mothurRemove(filename);
1170 rename(tempNameFile.c_str(), filename.c_str());
1172 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1176 catch(exception& e) {
1177 m->errorOut(e, "GetOTURepCommand", "processNames");
1181 //**********************************************************************************************************************
1182 SeqMap GetOTURepCommand::getMap(int row) {
1186 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1187 if (rowPositions[row] != -1){
1189 inRow.seekg(rowPositions[row]);
1191 int rowNum, numDists, colNum;
1194 inRow >> rowNum >> numDists;
1196 for(int i = 0; i < numDists; i++) {
1197 inRow >> colNum >> dist;
1198 rowMap[colNum] = dist;
1205 catch(exception& e) {
1206 m->errorOut(e, "GetOTURepCommand", "getMap");
1210 //**********************************************************************************************************************