5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pfasta);
44 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none",false,false); parameters.push_back(pphylip);
45 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName",false,false); parameters.push_back(pname);
46 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName",false,false); parameters.push_back(pcount);
47 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
48 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName",false,false); parameters.push_back(pcolumn);
49 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
50 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
51 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "",false,false); parameters.push_back(pcutoff);
52 CommandParameter pprecision("precision", "Number", "", "100", "", "", "",false,false); parameters.push_back(pprecision);
53 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pweighted);
54 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "",false,false); parameters.push_back(psorted);
55 CommandParameter plarge("large", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(plarge);
56 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
57 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
59 vector<string> myArray;
60 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
64 m->errorOut(e, "GetOTURepCommand", "setParameters");
68 //**********************************************************************************************************************
69 string GetOTURepCommand::getHelpString(){
71 string helpString = "";
72 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
73 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
74 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name or count filename is required. \n";
75 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
76 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
77 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
78 helpString += "The default value for label is all labels in your inputfile.\n";
79 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
80 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
81 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
82 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
83 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
84 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
85 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
86 helpString += "The group parameter allows you provide a group file.\n";
87 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
88 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
89 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
90 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
94 m->errorOut(e, "GetOTURepCommand", "getHelpString");
98 //**********************************************************************************************************************
99 string GetOTURepCommand::getOutputFileNameTag(string type, string inputName=""){
101 string outputFileName = "";
102 map<string, vector<string> >::iterator it;
104 //is this a type this command creates
105 it = outputTypes.find(type);
106 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
108 if (type == "fasta") { outputFileName = "rep.fasta"; }
109 else if (type == "name") { outputFileName = "rep.names"; }
110 else if (type == "count") { outputFileName = "rep.count_table"; }
111 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
113 return outputFileName;
115 catch(exception& e) {
116 m->errorOut(e, "GetOTURepCommand", "getOutputFileNameTag");
120 //**********************************************************************************************************************
121 GetOTURepCommand::GetOTURepCommand(){
123 abort = true; calledHelp = true;
125 vector<string> tempOutNames;
126 outputTypes["fasta"] = tempOutNames;
127 outputTypes["name"] = tempOutNames;
128 outputTypes["count"] = tempOutNames;
130 catch(exception& e) {
131 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
135 //**********************************************************************************************************************
136 GetOTURepCommand::GetOTURepCommand(string option) {
138 abort = false; calledHelp = false;
141 //allow user to run help
142 if (option == "help") {
143 help(); abort = true; calledHelp = true;
144 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
146 vector<string> myArray = setParameters();
148 OptionParser parser(option);
149 map<string, string> parameters = parser.getParameters();
151 ValidParameters validParameter;
152 map<string, string>::iterator it;
154 //check to make sure all parameters are valid for command
155 for (it = parameters.begin(); it != parameters.end(); it++) {
156 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
159 //initialize outputTypes
160 vector<string> tempOutNames;
161 outputTypes["fasta"] = tempOutNames;
162 outputTypes["name"] = tempOutNames;
163 outputTypes["count"] = tempOutNames;
165 //if the user changes the input directory command factory will send this info to us in the output parameter
166 string inputDir = validParameter.validFile(parameters, "inputdir", false);
167 if (inputDir == "not found"){ inputDir = ""; }
170 it = parameters.find("list");
171 //user has given a template file
172 if(it != parameters.end()){
173 path = m->hasPath(it->second);
174 //if the user has not given a path then, add inputdir. else leave path alone.
175 if (path == "") { parameters["list"] = inputDir + it->second; }
178 it = parameters.find("fasta");
179 //user has given a template file
180 if(it != parameters.end()){
181 path = m->hasPath(it->second);
182 //if the user has not given a path then, add inputdir. else leave path alone.
183 if (path == "") { parameters["fasta"] = inputDir + it->second; }
186 it = parameters.find("phylip");
187 //user has given a template file
188 if(it != parameters.end()){
189 path = m->hasPath(it->second);
190 //if the user has not given a path then, add inputdir. else leave path alone.
191 if (path == "") { parameters["phylip"] = inputDir + it->second; }
194 it = parameters.find("column");
195 //user has given a template file
196 if(it != parameters.end()){
197 path = m->hasPath(it->second);
198 //if the user has not given a path then, add inputdir. else leave path alone.
199 if (path == "") { parameters["column"] = inputDir + it->second; }
202 it = parameters.find("name");
203 //user has given a template file
204 if(it != parameters.end()){
205 path = m->hasPath(it->second);
206 //if the user has not given a path then, add inputdir. else leave path alone.
207 if (path == "") { parameters["name"] = inputDir + it->second; }
210 it = parameters.find("group");
211 //user has given a template file
212 if(it != parameters.end()){
213 path = m->hasPath(it->second);
214 //if the user has not given a path then, add inputdir. else leave path alone.
215 if (path == "") { parameters["group"] = inputDir + it->second; }
218 it = parameters.find("count");
219 //user has given a template file
220 if(it != parameters.end()){
221 path = m->hasPath(it->second);
222 //if the user has not given a path then, add inputdir. else leave path alone.
223 if (path == "") { parameters["count"] = inputDir + it->second; }
228 //if the user changes the output directory command factory will send this info to us in the output parameter
229 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
231 //check for required parameters
232 fastafile = validParameter.validFile(parameters, "fasta", true);
233 if (fastafile == "not found") { fastafile = ""; }
234 else if (fastafile == "not open") { abort = true; }
235 else { m->setFastaFile(fastafile); }
237 listfile = validParameter.validFile(parameters, "list", true);
238 if (listfile == "not found") {
239 listfile = m->getListFile();
240 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
241 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
243 else if (listfile == "not open") { abort = true; }
244 else { m->setListFile(listfile); }
246 phylipfile = validParameter.validFile(parameters, "phylip", true);
247 if (phylipfile == "not found") { phylipfile = ""; }
248 else if (phylipfile == "not open") { abort = true; }
249 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
251 columnfile = validParameter.validFile(parameters, "column", true);
252 if (columnfile == "not found") { columnfile = ""; }
253 else if (columnfile == "not open") { abort = true; }
254 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
256 namefile = validParameter.validFile(parameters, "name", true);
257 if (namefile == "not open") { abort = true; }
258 else if (namefile == "not found") { namefile = ""; }
259 else { m->setNameFile(namefile); }
262 countfile = validParameter.validFile(parameters, "count", true);
263 if (countfile == "not found") { countfile = ""; }
264 else if (countfile == "not open") { abort = true; countfile = ""; }
266 m->setCountTableFile(countfile);
267 ct.readTable(countfile);
268 if (ct.hasGroupInfo()) { hasGroups = true; }
271 if ((namefile != "") && (countfile != "")) {
272 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
275 if ((groupfile != "") && (countfile != "")) {
276 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
279 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
280 //give priority to column, then phylip
281 columnfile = m->getColumnFile();
282 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
284 phylipfile = m->getPhylipFile();
285 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
287 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
291 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
293 if (columnfile != "") {
294 if ((namefile == "") && (countfile == "")) {
295 namefile = m->getNameFile();
296 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
298 countfile = m->getCountTableFile();
299 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
301 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
308 //check for optional parameter and set defaults
309 // ...at some point should added some additional type checking...
310 label = validParameter.validFile(parameters, "label", false);
311 if (label == "not found") { label = ""; allLines = 1; }
313 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
314 else { allLines = 1; }
317 groupfile = validParameter.validFile(parameters, "group", true);
318 if (groupfile == "not open") { groupfile = ""; abort = true; }
319 else if (groupfile == "not found") { groupfile = ""; }
320 else { m->setGroupFile(groupfile); }
322 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
323 if (sorted == "none") { sorted=""; }
324 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
325 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
329 if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) {
330 m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine();
334 groups = validParameter.validFile(parameters, "groups", false);
335 if (groups == "not found") { groups = ""; }
337 if ((groupfile == "") && (!hasGroups)) {
338 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
341 m->splitAtDash(groups, Groups);
344 m->setGroups(Groups);
346 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
347 large = m->isTrue(temp);
349 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
350 weighted = m->isTrue(temp);
352 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
354 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
355 m->mothurConvert(temp, precision);
357 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
358 m->mothurConvert(temp, cutoff);
359 cutoff += (5 / (precision * 10.0));
362 catch(exception& e) {
363 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
368 //**********************************************************************************************************************
370 int GetOTURepCommand::execute(){
373 if (abort == true) { if (calledHelp) { return 0; } return 2; }
379 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); } return 0; }
381 if (groupfile != "") {
382 //read in group map info.
383 groupMap = new GroupMap(groupfile);
384 int error = groupMap->readMap();
385 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
387 if (Groups.size() != 0) {
389 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
390 util.setGroups(Groups, gNamesOfGroups, "getoturep");
391 groupMap->setNamesOfGroups(gNamesOfGroups);
393 }else if (hasGroups) {
394 if (Groups.size() != 0) {
396 vector<string> gNamesOfGroups = ct.getNamesOfGroups();
397 util.setGroups(Groups, gNamesOfGroups, "getoturep");
401 //done with listvector from matrix
402 if (list != NULL) { delete list; }
404 input = new InputData(listfile, "list");
405 list = input->getListVector();
406 string lastLabel = list->getLabel();
408 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
409 set<string> processedLabels;
410 set<string> userLabels = labels;
412 if (m->control_pressed) {
413 if (large) { inRow.close(); m->mothurRemove(distFile); }
414 delete input; delete list; return 0;
417 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
419 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
421 if (allLines == 1 || labels.count(list->getLabel()) == 1){
422 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
423 error = process(list);
424 if (error == 1) { return 0; } //there is an error in hte input files, abort command
426 if (m->control_pressed) {
427 if (large) { inRow.close(); m->mothurRemove(distFile); }
428 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
429 delete input; delete list; return 0;
432 processedLabels.insert(list->getLabel());
433 userLabels.erase(list->getLabel());
436 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
437 string saveLabel = list->getLabel();
440 list = input->getListVector(lastLabel);
441 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
442 error = process(list);
443 if (error == 1) { return 0; } //there is an error in hte input files, abort command
445 if (m->control_pressed) {
446 if (large) { inRow.close(); m->mothurRemove(distFile); }
447 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
448 delete input; delete list; return 0;
451 processedLabels.insert(list->getLabel());
452 userLabels.erase(list->getLabel());
454 //restore real lastlabel to save below
455 list->setLabel(saveLabel);
458 lastLabel = list->getLabel();
461 list = input->getListVector();
464 //output error messages about any remaining user labels
465 bool needToRun = false;
466 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
467 m->mothurOut("Your file does not include the label " + (*it));
468 if (processedLabels.count(lastLabel) != 1) {
469 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
472 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
476 //run last label if you need to
477 if (needToRun == true) {
478 if (list != NULL) { delete list; }
479 list = input->getListVector(lastLabel);
480 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
481 error = process(list);
483 if (error == 1) { return 0; } //there is an error in hte input files, abort command
485 if (m->control_pressed) {
486 if (large) { inRow.close(); m->mothurRemove(distFile); }
487 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
488 delete input; delete list; return 0;
492 //close and remove formatted matrix file
495 m->mothurRemove(distFile);
500 if (!weighted) { nameFileMap.clear(); }
503 if (fastafile != "") {
505 fasta = new FastaMap();
506 fasta->readFastaFile(fastafile);
508 //if user gave a namesfile then use it
509 if (namefile != "") { readNamesFile(); }
511 //output create and output the .rep.fasta files
512 map<string, string>::iterator itNameFile;
513 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
514 processFastaNames(itNameFile->first, itNameFile->second);
517 //output create and output the .rep.fasta files
518 map<string, string>::iterator itNameFile;
519 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
520 processNames(itNameFile->first, itNameFile->second);
525 if (groupfile != "") { delete groupMap; }
527 if (m->control_pressed) { return 0; }
529 //set fasta file as new current fastafile - use first one??
531 itTypes = outputTypes.find("fasta");
532 if (itTypes != outputTypes.end()) {
533 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
536 itTypes = outputTypes.find("name");
537 if (itTypes != outputTypes.end()) {
538 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
541 itTypes = outputTypes.find("count");
542 if (itTypes != outputTypes.end()) {
543 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
546 m->mothurOutEndLine();
547 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
548 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
549 m->mothurOutEndLine();
553 catch(exception& e) {
554 m->errorOut(e, "GetOTURepCommand", "execute");
558 //**********************************************************************************************************************
559 int GetOTURepCommand::readDist() {
563 //read distance files
564 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
565 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
566 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
568 readMatrix->setCutoff(cutoff);
570 NameAssignment* nameMap = NULL;
572 nameMap = new NameAssignment(namefile);
574 readMatrix->read(nameMap);
575 }else if (countfile != "") {
576 readMatrix->read(&ct);
578 readMatrix->read(nameMap);
581 if (m->control_pressed) { delete readMatrix; return 0; }
583 list = readMatrix->getListVector();
584 SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
586 // Create a data structure to quickly access the distance information.
587 // It consists of a vector of distance maps, where each map contains
588 // all distances of a certain sequence. Vector and maps are accessed
589 // via the index of a sequence in the distance matrix
590 seqVec = vector<SeqMap>(list->size());
591 for (int i = 0; i < matrix->seqVec.size(); i++) {
592 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
593 if (m->control_pressed) { delete readMatrix; return 0; }
594 //already added everyone else in row
595 if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
598 //add dummy map for unweighted calc
600 seqVec.push_back(dummy);
606 if (m->control_pressed) { return 0; }
608 //process file and set up indexes
609 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
610 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
611 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
613 formatMatrix->setCutoff(cutoff);
615 NameAssignment* nameMap = NULL;
617 nameMap = new NameAssignment(namefile);
619 formatMatrix->read(nameMap);
620 }else if (countfile != "") {
621 formatMatrix->read(&ct);
623 formatMatrix->read(nameMap);
626 if (m->control_pressed) { delete formatMatrix; return 0; }
628 list = formatMatrix->getListVector();
629 distFile = formatMatrix->getFormattedFileName();
631 //positions in file where the distances for each sequence begin
632 //rowPositions[1] = position in file where distance related to sequence 1 start.
633 rowPositions = formatMatrix->getRowPositions();
634 rowPositions.push_back(-1); //dummy row for unweighted calc
639 //openfile for getMap to use
640 m->openInputFile(distFile, inRow);
642 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
646 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
648 vector<string> names;
650 //map names to rows in sparsematrix
651 for (int i = 0; i < list->size(); i++) {
653 binnames = list->get(i);
655 m->splitAtComma(binnames, names);
657 for (int j = 0; j < names.size(); j++) {
658 nameToIndex[names[j]] = i;
661 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
663 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; }
667 catch(exception& e) {
668 m->errorOut(e, "GetOTURepCommand", "execute");
672 //**********************************************************************************************************************
673 void GetOTURepCommand::readNamesFile() {
676 vector<string> dupNames;
677 m->openInputFile(namefile, in);
679 string name, names, sequence;
682 in >> name; //read from first column A
683 in >> names; //read from second column A,B,C,D
687 //parse names into vector
688 m->splitAtComma(names, dupNames);
690 //store names in fasta map
691 sequence = fasta->getSequence(name);
692 for (int i = 0; i < dupNames.size(); i++) {
693 fasta->push_back(dupNames[i], sequence);
701 catch(exception& e) {
702 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
706 //**********************************************************************************************************************
707 //read names file to find the weighted rep for each bin
708 void GetOTURepCommand::readNamesFile(bool w) {
711 vector<string> dupNames;
712 m->openInputFile(namefile, in);
714 string name, names, sequence;
717 in >> name; m->gobble(in); //read from first column A
718 in >> names; //read from second column A,B,C,D
722 //parse names into vector
723 m->splitAtComma(names, dupNames);
725 for (int i = 0; i < dupNames.size(); i++) {
726 nameFileMap[dupNames[i]] = name;
734 catch(exception& e) {
735 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
739 //**********************************************************************************************************************
740 string GetOTURepCommand::findRep(vector<string> names, string group) {
742 // if only 1 sequence in bin or processing the "unique" label, then
743 // the first sequence of the OTU is the representative one
744 if ((names.size() == 1)) {
747 vector<int> seqIndex; //(names.size());
748 map<string, string>::iterator itNameFile;
749 map<string, int>::iterator itNameIndex;
751 //fill seqIndex and initialize sums
752 for (size_t i = 0; i < names.size(); i++) {
754 seqIndex.push_back(nameToIndex[names[i]]);
755 if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
757 if (group != "") { numRep = ct.getGroupCount(names[i], group); }
758 else { numRep = ct.getGroupCount(names[i]); }
759 for (int j = 1; j < numRep; j++) { //don't add yourself again
760 seqIndex.push_back(nameToIndex[names[i]]);
764 if (namefile == "") {
765 itNameIndex = nameToIndex.find(names[i]);
767 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
768 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
769 else { seqIndex.push_back((seqVec.size()-1)); }
771 seqIndex.push_back(itNameIndex->second);
775 itNameFile = nameFileMap.find(names[i]);
777 if (itNameFile == nameFileMap.end()) {
778 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
780 string name1 = itNameFile->first;
781 string name2 = itNameFile->second;
783 if (name1 == name2) { //then you are unique so add your real dists
784 seqIndex.push_back(nameToIndex[names[i]]);
786 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
787 else { seqIndex.push_back((seqVec.size()-1)); }
794 vector<float> max_dist(seqIndex.size(), 0.0);
795 vector<float> total_dist(seqIndex.size(), 0.0);
797 // loop through all entries in seqIndex
800 for (size_t i=0; i < seqIndex.size(); i++) {
801 if (m->control_pressed) { return "control"; }
803 if (!large) { currMap = seqVec[seqIndex[i]]; }
804 else { currMap = getMap(seqIndex[i]); }
806 for (size_t j=0; j < seqIndex.size(); j++) {
807 it = currMap.find(seqIndex[j]);
808 if (it != currMap.end()) {
809 max_dist[i] = max(max_dist[i], it->second);
810 max_dist[j] = max(max_dist[j], it->second);
811 total_dist[i] += it->second;
812 total_dist[j] += it->second;
813 }else{ //if you can't find the distance make it the cutoff
814 max_dist[i] = max(max_dist[i], cutoff);
815 max_dist[j] = max(max_dist[j], cutoff);
816 total_dist[i] += cutoff;
817 total_dist[j] += cutoff;
822 // sequence with the smallest maximum distance is the representative
823 //if tie occurs pick sequence with smallest average distance
826 for (size_t i=0; i < max_dist.size(); i++) {
827 if (m->control_pressed) { return "control"; }
828 if (max_dist[i] < min) {
831 }else if (max_dist[i] == min) {
832 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
833 float newAverage = total_dist[i] / (float) total_dist.size();
835 if (newAverage < currentAverage) {
842 return(names[minIndex]);
845 catch(exception& e) {
846 m->errorOut(e, "GetOTURepCommand", "FindRep");
851 //**********************************************************************************************************************
852 int GetOTURepCommand::process(ListVector* processList) {
854 string name, sequence;
858 if (outputDir == "") { outputDir += m->hasPath(listfile); }
860 ofstream newNamesOutput;
861 string outputNamesFile;
862 map<string, ofstream*> filehandles;
864 if (Groups.size() == 0) { //you don't want to use groups
865 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + ".";
866 if (countfile == "") {
867 outputNamesFile += getOutputFileNameTag("name");
868 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
870 outputNamesFile += getOutputFileNameTag("count");
871 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
873 outputNameFiles[outputNamesFile] = processList->getLabel();
874 m->openOutputFile(outputNamesFile, newNamesOutput);
875 newNamesOutput << "noGroup" << endl;
876 }else{ //you want to use groups
878 for (int i=0; i<Groups.size(); i++) {
880 filehandles[Groups[i]] = temp;
881 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
882 if (countfile == "") {
883 outputNamesFile += getOutputFileNameTag("name");
884 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
886 outputNamesFile += getOutputFileNameTag("count");
887 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
890 m->openOutputFile(outputNamesFile, *(temp));
891 *(temp) << Groups[i] << endl;
892 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
896 //for each bin in the list vector
897 for (int i = 0; i < processList->size(); i++) {
898 if (m->control_pressed) {
900 if (Groups.size() == 0) { //you don't want to use groups
901 newNamesOutput.close();
903 for (int j=0; j<Groups.size(); j++) {
904 (*(filehandles[Groups[j]])).close();
905 delete filehandles[Groups[j]];
911 string temp = processList->get(i);
912 vector<string> namesInBin;
913 m->splitAtComma(temp, namesInBin);
915 if (Groups.size() == 0) {
916 nameRep = findRep(namesInBin, "");
917 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
919 map<string, vector<string> > NamesInGroup;
920 for (int j=0; j<Groups.size(); j++) { //initialize groups
921 NamesInGroup[Groups[j]].resize(0);
924 for (int j=0; j<namesInBin.size(); j++) {
925 if (groupfile != "") {
926 string thisgroup = groupMap->getGroup(namesInBin[j]);
927 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
929 //add this name to correct group
930 if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); }
932 vector<string> thisSeqsGroups = ct.getGroups(namesInBin[j]);
933 for (int k = 0; k < thisSeqsGroups.size(); k++) {
934 if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); }
939 //get rep for each group in otu
940 for (int j=0; j<Groups.size(); j++) {
941 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
942 //get rep for each group
943 nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]);
945 //output group rep and other members of this group
946 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
948 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
949 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
952 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
958 if (Groups.size() == 0) { //you don't want to use groups
959 newNamesOutput.close();
961 for (int i=0; i<Groups.size(); i++) {
962 (*(filehandles[Groups[i]])).close();
963 delete filehandles[Groups[i]];
970 catch(exception& e) {
971 m->errorOut(e, "GetOTURepCommand", "process");
975 //**********************************************************************************************************************
976 int GetOTURepCommand::processFastaNames(string filename, string label) {
980 if (outputDir == "") { outputDir += m->hasPath(listfile); }
981 string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + label + "." + getOutputFileNameTag("fasta");
982 m->openOutputFile(outputFileName, out);
983 vector<repStruct> reps;
984 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
987 string tempNameFile = filename + ".temp";
988 m->openOutputFile(tempNameFile, out2);
991 m->openInputFile(filename, in);
994 string tempGroup = "";
995 in >> tempGroup; m->gobble(in);
998 if (countfile != "") {
999 thisCt.readTable(countfile);
1000 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1005 string rep, binnames;
1006 in >> i >> rep >> binnames; m->gobble(in);
1008 vector<string> names;
1009 m->splitAtComma(binnames, names);
1010 int binsize = names.size();
1012 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1014 if (tempGroup == "noGroup") {
1015 for (int j = 0; j < names.size(); j++) {
1016 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1018 binsize = thisCt.getNumSeqs(rep);
1021 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1022 out2 << rep << '\t' << total << '\t' << total << endl;
1026 thistotal += binsize;
1027 //if you have a groupfile
1029 map<string, string> groups;
1030 map<string, string>::iterator groupIt;
1031 if (groupfile != "") {
1032 //find the groups that are in this bin
1033 for (int i = 0; i < names.size(); i++) {
1034 string groupName = groupMap->getGroup(names[i]);
1035 if (groupName == "not found") {
1036 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
1039 groups[groupName] = groupName;
1043 //turn the groups into a string
1044 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1045 group += groupIt->first + "-";
1048 group = group.substr(0, group.length()-1);
1049 }else if (hasGroups) {
1050 map<string, string> groups;
1051 for (int i = 0; i < names.size(); i++) {
1052 vector<string> thisSeqsGroups = ct.getGroups(names[i]);
1053 for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; }
1055 //turn the groups into a string
1056 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1057 group += groupIt->first + "-";
1060 group = group.substr(0, group.length()-1);
1061 //cout << group << endl;
1066 //print out name and sequence for that bin
1067 string sequence = fasta->getSequence(rep);
1069 if (sequence != "not found") {
1070 if (sorted == "") { //print them out
1071 rep = rep + "\t" + toString(i+1);
1072 rep = rep + "|" + toString(binsize);
1074 rep = rep + "|" + group;
1076 out << ">" << rep << endl;
1077 out << sequence << endl;
1079 repStruct newRep(rep, i+1, binsize, group);
1080 reps.push_back(newRep);
1083 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
1088 if (sorted != "") { //then sort them and print them
1089 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
1090 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
1091 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
1092 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
1095 for (int i = 0; i < reps.size(); i++) {
1096 string sequence = fasta->getSequence(reps[i].name);
1097 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
1098 outputName = outputName + "|" + toString(reps[i].size);
1099 if (reps[i].group != "") {
1100 outputName = outputName + "|" + reps[i].group;
1102 out << ">" << outputName << endl;
1103 out << sequence << endl;
1111 m->mothurRemove(filename);
1112 rename(tempNameFile.c_str(), filename.c_str());
1114 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1119 catch(exception& e) {
1120 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
1124 //**********************************************************************************************************************
1125 int GetOTURepCommand::processNames(string filename, string label) {
1128 //create output file
1129 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1132 string tempNameFile = filename + ".temp";
1133 m->openOutputFile(tempNameFile, out2);
1136 m->openInputFile(filename, in);
1139 string rep, binnames;
1141 string tempGroup = "";
1142 in >> tempGroup; m->gobble(in);
1145 if (countfile != "") {
1146 thisCt.readTable(countfile);
1147 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1151 if (m->control_pressed) { break; }
1152 in >> i >> rep >> binnames; m->gobble(in);
1154 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1156 vector<string> names;
1157 m->splitAtComma(binnames, names);
1158 if (tempGroup == "noGroup") {
1159 for (int j = 0; j < names.size(); j++) {
1160 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1164 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1165 out2 << rep << '\t' << total << '\t' << total << endl;
1173 m->mothurRemove(filename);
1174 rename(tempNameFile.c_str(), filename.c_str());
1176 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1180 catch(exception& e) {
1181 m->errorOut(e, "GetOTURepCommand", "processNames");
1185 //**********************************************************************************************************************
1186 SeqMap GetOTURepCommand::getMap(int row) {
1190 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1191 if (rowPositions[row] != -1){
1193 inRow.seekg(rowPositions[row]);
1195 int rowNum, numDists, colNum;
1198 inRow >> rowNum >> numDists;
1200 for(int i = 0; i < numDists; i++) {
1201 inRow >> colNum >> dist;
1202 rowMap[colNum] = dist;
1209 catch(exception& e) {
1210 m->errorOut(e, "GetOTURepCommand", "getMap");
1214 //**********************************************************************************************************************