5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","name",false,true, true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,false, true); parameters.push_back(pfasta);
44 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none","",false,false, true); parameters.push_back(pphylip);
45 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","",false,false, true); parameters.push_back(pname);
46 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName","count",false,false, true); parameters.push_back(pcount);
47 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false, true); parameters.push_back(pgroup);
48 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName","",false,false, true); parameters.push_back(pcolumn);
49 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
50 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
51 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pcutoff);
52 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
53 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pweighted);
54 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "","",false,false); parameters.push_back(psorted);
55 CommandParameter pmethod("method", "Multiple", "distance-abundance", "distance", "", "", "","",false,false); parameters.push_back(pmethod);
56 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
57 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
58 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
60 vector<string> myArray;
61 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
65 m->errorOut(e, "GetOTURepCommand", "setParameters");
69 //**********************************************************************************************************************
70 string GetOTURepCommand::getHelpString(){
72 string helpString = "";
73 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted, method and label. The list parameter is required, as well as phylip or column and name if you are using method=distance. If method=abundance a name or count file is required.\n";
74 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
75 helpString += "The phylip or column parameter is required for method=distance, but only one may be used. If you use a column file the name or count filename is required. \n";
76 helpString += "The method parameter allows you to select the method of selecting the representative sequence. Choices are distance and abundance. The distance method finds the sequence with the smallest maximum distance to the other sequences. If tie occurs the sequence with smallest average distance is selected. The abundance method chooses the most abundant sequence in the OTU as the representative.\n";
77 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
78 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
79 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
80 helpString += "The default value for label is all labels in your inputfile.\n";
81 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
82 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
83 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
84 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
85 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
86 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
87 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
88 helpString += "The group parameter allows you provide a group file.\n";
89 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
90 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
91 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
92 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
96 m->errorOut(e, "GetOTURepCommand", "getHelpString");
100 //**********************************************************************************************************************
101 string GetOTURepCommand::getOutputPattern(string type) {
105 if (type == "fasta") { pattern = "[filename],[tag],rep.fasta-[filename],[tag],[group],rep.fasta"; }
106 else if (type == "name") { pattern = "[filename],[tag],rep.names-[filename],[tag],[group],rep.names"; }
107 else if (type == "count") { pattern = "[filename],[tag],rep.count_table-[filename],[tag],[group],rep.count_table"; }
108 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
112 catch(exception& e) {
113 m->errorOut(e, "GetOTURepCommand", "getOutputPattern");
117 //**********************************************************************************************************************
118 GetOTURepCommand::GetOTURepCommand(){
120 abort = true; calledHelp = true;
122 vector<string> tempOutNames;
123 outputTypes["fasta"] = tempOutNames;
124 outputTypes["name"] = tempOutNames;
125 outputTypes["count"] = tempOutNames;
127 catch(exception& e) {
128 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
132 //**********************************************************************************************************************
133 GetOTURepCommand::GetOTURepCommand(string option) {
135 abort = false; calledHelp = false;
138 //allow user to run help
139 if (option == "help") {
140 help(); abort = true; calledHelp = true;
141 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
143 vector<string> myArray = setParameters();
145 OptionParser parser(option);
146 map<string, string> parameters = parser.getParameters();
148 ValidParameters validParameter;
149 map<string, string>::iterator it;
151 //check to make sure all parameters are valid for command
152 for (it = parameters.begin(); it != parameters.end(); it++) {
153 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
156 //initialize outputTypes
157 vector<string> tempOutNames;
158 outputTypes["fasta"] = tempOutNames;
159 outputTypes["name"] = tempOutNames;
160 outputTypes["count"] = tempOutNames;
162 //if the user changes the input directory command factory will send this info to us in the output parameter
163 string inputDir = validParameter.validFile(parameters, "inputdir", false);
164 if (inputDir == "not found"){ inputDir = ""; }
167 it = parameters.find("list");
168 //user has given a template file
169 if(it != parameters.end()){
170 path = m->hasPath(it->second);
171 //if the user has not given a path then, add inputdir. else leave path alone.
172 if (path == "") { parameters["list"] = inputDir + it->second; }
175 it = parameters.find("fasta");
176 //user has given a template file
177 if(it != parameters.end()){
178 path = m->hasPath(it->second);
179 //if the user has not given a path then, add inputdir. else leave path alone.
180 if (path == "") { parameters["fasta"] = inputDir + it->second; }
183 it = parameters.find("phylip");
184 //user has given a template file
185 if(it != parameters.end()){
186 path = m->hasPath(it->second);
187 //if the user has not given a path then, add inputdir. else leave path alone.
188 if (path == "") { parameters["phylip"] = inputDir + it->second; }
191 it = parameters.find("column");
192 //user has given a template file
193 if(it != parameters.end()){
194 path = m->hasPath(it->second);
195 //if the user has not given a path then, add inputdir. else leave path alone.
196 if (path == "") { parameters["column"] = inputDir + it->second; }
199 it = parameters.find("name");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["name"] = inputDir + it->second; }
207 it = parameters.find("group");
208 //user has given a template file
209 if(it != parameters.end()){
210 path = m->hasPath(it->second);
211 //if the user has not given a path then, add inputdir. else leave path alone.
212 if (path == "") { parameters["group"] = inputDir + it->second; }
215 it = parameters.find("count");
216 //user has given a template file
217 if(it != parameters.end()){
218 path = m->hasPath(it->second);
219 //if the user has not given a path then, add inputdir. else leave path alone.
220 if (path == "") { parameters["count"] = inputDir + it->second; }
225 //if the user changes the output directory command factory will send this info to us in the output parameter
226 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
228 //check for required parameters
229 fastafile = validParameter.validFile(parameters, "fasta", true);
230 if (fastafile == "not found") { fastafile = ""; }
231 else if (fastafile == "not open") { abort = true; }
232 else { m->setFastaFile(fastafile); }
234 listfile = validParameter.validFile(parameters, "list", true);
235 if (listfile == "not found") {
236 listfile = m->getListFile();
237 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
238 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
240 else if (listfile == "not open") { abort = true; }
241 else { m->setListFile(listfile); }
243 phylipfile = validParameter.validFile(parameters, "phylip", true);
244 if (phylipfile == "not found") { phylipfile = ""; }
245 else if (phylipfile == "not open") { abort = true; }
246 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
248 columnfile = validParameter.validFile(parameters, "column", true);
249 if (columnfile == "not found") { columnfile = ""; }
250 else if (columnfile == "not open") { abort = true; }
251 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
253 namefile = validParameter.validFile(parameters, "name", true);
254 if (namefile == "not open") { abort = true; }
255 else if (namefile == "not found") { namefile = ""; }
256 else { m->setNameFile(namefile); }
259 countfile = validParameter.validFile(parameters, "count", true);
260 if (countfile == "not found") { countfile = ""; }
261 else if (countfile == "not open") { abort = true; countfile = ""; }
263 m->setCountTableFile(countfile);
264 ct.readTable(countfile, true);
265 if (ct.hasGroupInfo()) { hasGroups = true; }
268 groupfile = validParameter.validFile(parameters, "group", true);
269 if (groupfile == "not open") { groupfile = ""; abort = true; }
270 else if (groupfile == "not found") { groupfile = ""; }
271 else { m->setGroupFile(groupfile); }
273 method = validParameter.validFile(parameters, "method", false); if (method == "not found"){ method = "distance"; }
274 if ((method != "distance") && (method != "abundance")) {
275 m->mothurOut(method + " is not a valid option for the method parameter. The only options are: distance and abundance, aborting."); m->mothurOutEndLine(); abort = true;
278 if (method == "distance") {
279 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
280 //give priority to column, then phylip
281 columnfile = m->getColumnFile();
282 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
284 phylipfile = m->getPhylipFile();
285 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
287 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
291 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
293 if (columnfile != "") {
294 if ((namefile == "") && (countfile == "")) {
295 namefile = m->getNameFile();
296 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
298 countfile = m->getCountTableFile();
299 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
301 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
307 }else if (method == "abundance") {
308 if ((namefile == "") && (countfile == "")) {
309 namefile = m->getNameFile();
310 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
312 countfile = m->getCountTableFile();
313 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
315 m->mothurOut("You need to provide a namefile or countfile if you are going to use the abundance method."); m->mothurOutEndLine();
320 if ((phylipfile != "") || (columnfile != "")) {
321 m->mothurOut("[WARNING]: A phylip or column file is not needed to use the abundance method, ignoring."); m->mothurOutEndLine();
322 phylipfile = ""; columnfile = "";
326 if ((namefile != "") && (countfile != "")) {
327 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
330 if ((groupfile != "") && (countfile != "")) {
331 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
335 //check for optional parameter and set defaults
336 // ...at some point should added some additional type checking...
337 label = validParameter.validFile(parameters, "label", false);
338 if (label == "not found") { label = ""; allLines = 1; }
340 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
341 else { allLines = 1; }
345 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
346 if (sorted == "none") { sorted=""; }
347 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
348 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
354 if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) {
355 m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine();
359 groups = validParameter.validFile(parameters, "groups", false);
360 if (groups == "not found") { groups = ""; }
362 if ((groupfile == "") && (!hasGroups)) {
363 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
366 m->splitAtDash(groups, Groups);
369 m->setGroups(Groups);
371 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
372 large = m->isTrue(temp);
374 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
375 weighted = m->isTrue(temp);
377 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
379 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
380 m->mothurConvert(temp, precision);
382 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
383 m->mothurConvert(temp, cutoff);
384 cutoff += (5 / (precision * 10.0));
387 catch(exception& e) {
388 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
393 //**********************************************************************************************************************
395 int GetOTURepCommand::execute(){
398 if (abort == true) { if (calledHelp) { return 0; } return 2; }
402 if (method=="distance") {
404 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
406 //map name -> abundance for use if findRepAbund
407 if (namefile != "") { nameToIndex = m->readNames(namefile); }
410 if (m->control_pressed) { if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } }return 0; }
412 if (groupfile != "") {
413 //read in group map info.
414 groupMap = new GroupMap(groupfile);
415 int error = groupMap->readMap();
416 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
418 if (Groups.size() != 0) {
420 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
421 util.setGroups(Groups, gNamesOfGroups, "getoturep");
422 groupMap->setNamesOfGroups(gNamesOfGroups);
424 }else if (hasGroups) {
425 if (Groups.size() != 0) {
427 vector<string> gNamesOfGroups = ct.getNamesOfGroups();
428 util.setGroups(Groups, gNamesOfGroups, "getoturep");
432 //done with listvector from matrix
433 if (list != NULL) { delete list; }
435 InputData input(listfile, "list");
436 list = input.getListVector();
437 string lastLabel = list->getLabel();
439 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
440 set<string> processedLabels;
441 set<string> userLabels = labels;
443 if (m->control_pressed) { if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } } delete list; return 0; }
445 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
447 if (allLines == 1 || labels.count(list->getLabel()) == 1){
448 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
449 error = process(list);
450 if (error == 1) { return 0; } //there is an error in hte input files, abort command
452 if (m->control_pressed) {
453 if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } }
454 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
455 delete list; return 0;
458 processedLabels.insert(list->getLabel());
459 userLabels.erase(list->getLabel());
462 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
463 string saveLabel = list->getLabel();
466 list = input.getListVector(lastLabel);
467 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
468 error = process(list);
469 if (error == 1) { return 0; } //there is an error in hte input files, abort command
471 if (m->control_pressed) {
472 if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } }
473 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
474 delete list; return 0;
477 processedLabels.insert(list->getLabel());
478 userLabels.erase(list->getLabel());
480 //restore real lastlabel to save below
481 list->setLabel(saveLabel);
484 lastLabel = list->getLabel();
487 list = input.getListVector();
490 //output error messages about any remaining user labels
491 bool needToRun = false;
492 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
493 m->mothurOut("Your file does not include the label " + (*it));
494 if (processedLabels.count(lastLabel) != 1) {
495 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
498 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
502 //run last label if you need to
503 if (needToRun == true) {
504 if (list != NULL) { delete list; }
505 list = input.getListVector(lastLabel);
506 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
507 error = process(list);
509 if (error == 1) { return 0; } //there is an error in hte input files, abort command
511 if (m->control_pressed) {
512 if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } }
513 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
514 delete list; return 0;
518 //close and remove formatted matrix file
519 if (method=="distance") { if (large) { inRow.close(); m->mothurRemove(distFile); } if (!weighted) { nameFileMap.clear(); } }
521 if (fastafile != "") {
523 FastaMap* fasta = new FastaMap();
524 fasta->readFastaFile(fastafile);
526 //if user gave a namesfile then use it
527 if (namefile != "") { readNamesFile(fasta); }
529 //output create and output the .rep.fasta files
530 map<string, string>::iterator itNameFile;
531 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
532 processFastaNames(itNameFile->first, itNameFile->second, fasta);
536 //output create and output the .rep.fasta files
537 map<string, string>::iterator itNameFile;
538 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
539 processNames(itNameFile->first, itNameFile->second);
544 if (groupfile != "") { delete groupMap; }
546 if (m->control_pressed) { return 0; }
548 //set fasta file as new current fastafile - use first one??
550 itTypes = outputTypes.find("fasta");
551 if (itTypes != outputTypes.end()) {
552 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
555 itTypes = outputTypes.find("name");
556 if (itTypes != outputTypes.end()) {
557 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
560 itTypes = outputTypes.find("count");
561 if (itTypes != outputTypes.end()) {
562 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
565 m->mothurOutEndLine();
566 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
567 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
568 m->mothurOutEndLine();
572 catch(exception& e) {
573 m->errorOut(e, "GetOTURepCommand", "execute");
577 //**********************************************************************************************************************
578 int GetOTURepCommand::readDist() {
582 //read distance files
583 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
584 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
585 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
587 readMatrix->setCutoff(cutoff);
589 NameAssignment* nameMap = NULL;
591 nameMap = new NameAssignment(namefile);
593 readMatrix->read(nameMap);
594 }else if (countfile != "") {
595 readMatrix->read(&ct);
597 readMatrix->read(nameMap);
600 if (m->control_pressed) { delete readMatrix; return 0; }
602 list = readMatrix->getListVector();
603 SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
605 // Create a data structure to quickly access the distance information.
606 // It consists of a vector of distance maps, where each map contains
607 // all distances of a certain sequence. Vector and maps are accessed
608 // via the index of a sequence in the distance matrix
609 seqVec = vector<SeqMap>(list->size());
610 for (int i = 0; i < matrix->seqVec.size(); i++) {
611 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
612 if (m->control_pressed) { delete readMatrix; return 0; }
613 //already added everyone else in row
614 if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
617 //add dummy map for unweighted calc
619 seqVec.push_back(dummy);
625 if (m->control_pressed) { return 0; }
627 //process file and set up indexes
628 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
629 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
630 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
632 formatMatrix->setCutoff(cutoff);
634 NameAssignment* nameMap = NULL;
636 nameMap = new NameAssignment(namefile);
638 formatMatrix->read(nameMap);
639 }else if (countfile != "") {
640 formatMatrix->read(&ct);
642 formatMatrix->read(nameMap);
645 if (m->control_pressed) { delete formatMatrix; return 0; }
647 list = formatMatrix->getListVector();
648 distFile = formatMatrix->getFormattedFileName();
650 //positions in file where the distances for each sequence begin
651 //rowPositions[1] = position in file where distance related to sequence 1 start.
652 rowPositions = formatMatrix->getRowPositions();
653 rowPositions.push_back(-1); //dummy row for unweighted calc
658 //openfile for getMap to use
659 m->openInputFile(distFile, inRow);
661 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
665 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
667 vector<string> names;
669 //map names to rows in sparsematrix
670 for (int i = 0; i < list->size(); i++) {
672 binnames = list->get(i);
674 m->splitAtComma(binnames, names);
676 for (int j = 0; j < names.size(); j++) {
677 nameToIndex[names[j]] = i;
680 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
682 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; }
686 catch(exception& e) {
687 m->errorOut(e, "GetOTURepCommand", "readDist");
691 //**********************************************************************************************************************
692 void GetOTURepCommand::readNamesFile(FastaMap*& fasta) {
695 vector<string> dupNames;
696 m->openInputFile(namefile, in);
698 string name, names, sequence;
701 in >> name; //read from first column A
702 in >> names; //read from second column A,B,C,D
706 //parse names into vector
707 m->splitAtComma(names, dupNames);
709 //store names in fasta map
710 sequence = fasta->getSequence(name);
711 for (int i = 0; i < dupNames.size(); i++) {
712 fasta->push_back(dupNames[i], sequence);
720 catch(exception& e) {
721 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
725 //**********************************************************************************************************************
726 //read names file to find the weighted rep for each bin
727 void GetOTURepCommand::readNamesFile(bool w) {
730 vector<string> dupNames;
731 m->openInputFile(namefile, in);
733 string name, names, sequence;
736 in >> name; m->gobble(in); //read from first column A
737 in >> names; //read from second column A,B,C,D
741 //parse names into vector
742 m->splitAtComma(names, dupNames);
744 for (int i = 0; i < dupNames.size(); i++) {
745 nameFileMap[dupNames[i]] = name;
753 catch(exception& e) {
754 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
758 //**********************************************************************************************************************
759 string GetOTURepCommand::findRepAbund(vector<string> names, string group) {
762 string rep = "notFound";
764 if ((names.size() == 1)) {
767 //fill seqIndex and initialize sums
769 for (int i = 0; i < names.size(); i++) {
771 if (m->control_pressed) { return "control"; }
773 if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
775 if (group != "") { numRep = ct.getGroupCount(names[i], group); }
776 else { numRep = ct.getGroupCount(names[i]); }
777 if (numRep > maxAbund) {
779 reps.push_back(names[i]);
781 }else if(numRep == maxAbund) { //tie
782 reps.push_back(names[i]);
784 }else { //name file used, we assume list file contains all sequences
785 map<string, int>::iterator itNameMap = nameToIndex.find(names[i]);
786 if (itNameMap == nameToIndex.end()) {} //assume that this sequence is not a unique
788 if (itNameMap->second > maxAbund) {
790 reps.push_back(names[i]);
791 maxAbund = itNameMap->second;
792 }else if(itNameMap->second == maxAbund) { //tie
793 reps.push_back(names[i]);
799 if (reps.size() == 0) { m->mothurOut("[ERROR]: no rep found, file mismatch?? Quitting.\n"); m->control_pressed = true; }
800 else if (reps.size() == 1) { rep = reps[0]; }
802 int index = m->getRandomIndex(reps.size()-1);
809 catch(exception& e) {
810 m->errorOut(e, "GetOTURepCommand", "findRepAbund");
814 //**********************************************************************************************************************
815 string GetOTURepCommand::findRep(vector<string> names, string group) {
818 if (method == "abundance") { return (findRepAbund(names, group)); }
819 else { //find rep based on distance
821 // if only 1 sequence in bin or processing the "unique" label, then
822 // the first sequence of the OTU is the representative one
823 if ((names.size() == 1)) {
826 vector<int> seqIndex; //(names.size());
827 map<string, string>::iterator itNameFile;
828 map<string, int>::iterator itNameIndex;
830 //fill seqIndex and initialize sums
831 for (size_t i = 0; i < names.size(); i++) {
833 seqIndex.push_back(nameToIndex[names[i]]);
834 if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
836 if (group != "") { numRep = ct.getGroupCount(names[i], group); }
837 else { numRep = ct.getGroupCount(names[i]); }
838 for (int j = 1; j < numRep; j++) { //don't add yourself again
839 seqIndex.push_back(nameToIndex[names[i]]);
843 if (namefile == "") {
844 itNameIndex = nameToIndex.find(names[i]);
846 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
847 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
848 else { seqIndex.push_back((seqVec.size()-1)); }
850 seqIndex.push_back(itNameIndex->second);
854 itNameFile = nameFileMap.find(names[i]);
856 if (itNameFile == nameFileMap.end()) {
857 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
859 string name1 = itNameFile->first;
860 string name2 = itNameFile->second;
862 if (name1 == name2) { //then you are unique so add your real dists
863 seqIndex.push_back(nameToIndex[names[i]]);
865 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
866 else { seqIndex.push_back((seqVec.size()-1)); }
873 vector<float> max_dist(seqIndex.size(), 0.0);
874 vector<float> total_dist(seqIndex.size(), 0.0);
876 // loop through all entries in seqIndex
879 for (size_t i=0; i < seqIndex.size(); i++) {
880 if (m->control_pressed) { return "control"; }
882 if (!large) { currMap = seqVec[seqIndex[i]]; }
883 else { currMap = getMap(seqIndex[i]); }
885 for (size_t j=0; j < seqIndex.size(); j++) {
886 it = currMap.find(seqIndex[j]);
887 if (it != currMap.end()) {
888 max_dist[i] = max(max_dist[i], it->second);
889 max_dist[j] = max(max_dist[j], it->second);
890 total_dist[i] += it->second;
891 total_dist[j] += it->second;
892 }else{ //if you can't find the distance make it the cutoff
893 max_dist[i] = max(max_dist[i], cutoff);
894 max_dist[j] = max(max_dist[j], cutoff);
895 total_dist[i] += cutoff;
896 total_dist[j] += cutoff;
901 // sequence with the smallest maximum distance is the representative
902 //if tie occurs pick sequence with smallest average distance
905 for (size_t i=0; i < max_dist.size(); i++) {
906 if (m->control_pressed) { return "control"; }
907 if (max_dist[i] < min) {
910 }else if (max_dist[i] == min) {
911 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
912 float newAverage = total_dist[i] / (float) total_dist.size();
914 if (newAverage < currentAverage) {
921 return(names[minIndex]);
925 catch(exception& e) {
926 m->errorOut(e, "GetOTURepCommand", "FindRep");
931 //**********************************************************************************************************************
932 int GetOTURepCommand::process(ListVector* processList) {
934 string name, sequence;
938 if (outputDir == "") { outputDir += m->hasPath(listfile); }
940 ofstream newNamesOutput;
941 string outputNamesFile;
942 map<string, ofstream*> filehandles;
944 map<string, string> variables;
945 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
947 if (Groups.size() == 0) { //you don't want to use groups
948 variables["[tag]"] = processList->getLabel();
949 if (countfile == "") {
950 outputNamesFile = getOutputFileName("name", variables);
951 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
953 outputNamesFile = getOutputFileName("count", variables);
954 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
956 outputNameFiles[outputNamesFile] = processList->getLabel();
957 m->openOutputFile(outputNamesFile, newNamesOutput);
958 newNamesOutput << "noGroup" << endl;
959 }else{ //you want to use groups
961 for (int i=0; i<Groups.size(); i++) {
963 variables["[tag]"] = processList->getLabel();
964 variables["[group]"] = Groups[i];
965 filehandles[Groups[i]] = temp;
966 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
967 if (countfile == "") {
968 outputNamesFile = getOutputFileName("name", variables);
969 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
971 outputNamesFile = getOutputFileName("count", variables);
972 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
975 m->openOutputFile(outputNamesFile, *(temp));
976 *(temp) << Groups[i] << endl;
977 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
981 //for each bin in the list vector
982 for (int i = 0; i < processList->size(); i++) {
983 if (m->control_pressed) {
985 if (Groups.size() == 0) { //you don't want to use groups
986 newNamesOutput.close();
988 for (int j=0; j<Groups.size(); j++) {
989 (*(filehandles[Groups[j]])).close();
990 delete filehandles[Groups[j]];
996 string temp = processList->get(i);
997 vector<string> namesInBin;
998 m->splitAtComma(temp, namesInBin);
1000 if (Groups.size() == 0) {
1001 nameRep = findRep(namesInBin, "");
1002 newNamesOutput << i << '\t' << nameRep << '\t';
1004 //put rep at first position in names line
1005 string outputString = nameRep + ",";
1006 for (int k=0; k<namesInBin.size()-1; k++) {//output list of names in this otu
1007 if (namesInBin[k] != nameRep) { outputString += namesInBin[k] + ","; }
1011 if (namesInBin[namesInBin.size()-1] != nameRep) { outputString += namesInBin[namesInBin.size()-1]; }
1013 if (outputString[outputString.length()-1] == ',') { //rip off comma
1014 outputString = outputString.substr(0, outputString.length()-1);
1016 newNamesOutput << outputString << endl;
1018 map<string, vector<string> > NamesInGroup;
1019 for (int j=0; j<Groups.size(); j++) { //initialize groups
1020 NamesInGroup[Groups[j]].resize(0);
1023 for (int j=0; j<namesInBin.size(); j++) {
1024 if (groupfile != "") {
1025 string thisgroup = groupMap->getGroup(namesInBin[j]);
1026 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
1028 //add this name to correct group
1029 if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); }
1031 vector<string> thisSeqsGroups = ct.getGroups(namesInBin[j]);
1032 for (int k = 0; k < thisSeqsGroups.size(); k++) {
1033 if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); }
1038 //get rep for each group in otu
1039 for (int j=0; j<Groups.size(); j++) {
1040 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
1041 //get rep for each group
1042 nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]);
1044 //output group rep and other members of this group
1045 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
1047 //put rep at first position in names line
1048 string outputString = nameRep + ",";
1049 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
1050 if (NamesInGroup[Groups[j]][k] != nameRep) { outputString += NamesInGroup[Groups[j]][k] + ","; }
1054 if (NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] != nameRep) { outputString += NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1]; }
1056 if (outputString[outputString.length()-1] == ',') { //rip off comma
1057 outputString = outputString.substr(0, outputString.length()-1);
1059 (*(filehandles[Groups[j]])) << outputString << endl;
1065 if (Groups.size() == 0) { //you don't want to use groups
1066 newNamesOutput.close();
1068 for (int i=0; i<Groups.size(); i++) {
1069 (*(filehandles[Groups[i]])).close();
1070 delete filehandles[Groups[i]];
1077 catch(exception& e) {
1078 m->errorOut(e, "GetOTURepCommand", "process");
1082 //**********************************************************************************************************************
1083 int GetOTURepCommand::processFastaNames(string filename, string label, FastaMap*& fasta) {
1086 //create output file
1087 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1088 map<string, string> variables;
1089 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
1090 variables["[tag]"] = label;
1091 string outputFileName = getOutputFileName("fasta",variables);
1092 m->openOutputFile(outputFileName, out);
1093 vector<repStruct> reps;
1094 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
1097 string tempNameFile = filename + ".temp";
1098 m->openOutputFile(tempNameFile, out2);
1101 m->openInputFile(filename, in);
1104 string tempGroup = "";
1105 in >> tempGroup; m->gobble(in);
1108 if (countfile != "") {
1109 thisCt.readTable(countfile, true);
1110 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1115 string rep, binnames;
1116 in >> i >> rep >> binnames; m->gobble(in);
1118 vector<string> names;
1119 m->splitAtComma(binnames, names);
1120 int binsize = names.size();
1122 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1124 if (tempGroup == "noGroup") {
1125 for (int j = 0; j < names.size(); j++) {
1126 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1128 binsize = thisCt.getNumSeqs(rep);
1131 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1132 out2 << rep << '\t' << total << '\t' << total << endl;
1136 thistotal += binsize;
1137 //if you have a groupfile
1139 map<string, string> groups;
1140 map<string, string>::iterator groupIt;
1141 if (groupfile != "") {
1142 //find the groups that are in this bin
1143 for (int i = 0; i < names.size(); i++) {
1144 string groupName = groupMap->getGroup(names[i]);
1145 if (groupName == "not found") {
1146 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
1149 groups[groupName] = groupName;
1153 //turn the groups into a string
1154 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1155 group += groupIt->first + "-";
1158 group = group.substr(0, group.length()-1);
1159 }else if (hasGroups) {
1160 map<string, string> groups;
1161 for (int i = 0; i < names.size(); i++) {
1162 vector<string> thisSeqsGroups = ct.getGroups(names[i]);
1163 for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; }
1165 //turn the groups into a string
1166 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1167 group += groupIt->first + "-";
1170 group = group.substr(0, group.length()-1);
1171 //cout << group << endl;
1176 //print out name and sequence for that bin
1177 string sequence = fasta->getSequence(rep);
1179 if (sequence != "not found") {
1180 if (sorted == "") { //print them out
1181 rep = rep + "\t" + toString(i+1);
1182 rep = rep + "|" + toString(binsize);
1184 rep = rep + "|" + group;
1186 out << ">" << rep << endl;
1187 out << sequence << endl;
1189 repStruct newRep(rep, i+1, binsize, group);
1190 reps.push_back(newRep);
1193 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
1198 if (sorted != "") { //then sort them and print them
1199 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
1200 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
1201 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
1202 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
1205 for (int i = 0; i < reps.size(); i++) {
1206 string sequence = fasta->getSequence(reps[i].name);
1207 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
1208 outputName = outputName + "|" + toString(reps[i].size);
1209 if (reps[i].group != "") {
1210 outputName = outputName + "|" + reps[i].group;
1212 out << ">" << outputName << endl;
1213 out << sequence << endl;
1221 m->mothurRemove(filename);
1222 rename(tempNameFile.c_str(), filename.c_str());
1224 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1229 catch(exception& e) {
1230 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
1234 //**********************************************************************************************************************
1235 int GetOTURepCommand::processNames(string filename, string label) {
1238 //create output file
1239 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1242 string tempNameFile = filename + ".temp";
1243 m->openOutputFile(tempNameFile, out2);
1246 m->openInputFile(filename, in);
1249 string rep, binnames;
1251 string tempGroup = "";
1252 in >> tempGroup; m->gobble(in);
1255 if (countfile != "") {
1256 thisCt.readTable(countfile, true);
1257 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1261 if (m->control_pressed) { break; }
1262 in >> i >> rep >> binnames; m->gobble(in);
1264 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1266 vector<string> names;
1267 m->splitAtComma(binnames, names);
1268 if (tempGroup == "noGroup") {
1269 for (int j = 0; j < names.size(); j++) {
1270 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1274 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1275 out2 << rep << '\t' << total << '\t' << total << endl;
1283 m->mothurRemove(filename);
1284 rename(tempNameFile.c_str(), filename.c_str());
1286 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1290 catch(exception& e) {
1291 m->errorOut(e, "GetOTURepCommand", "processNames");
1295 //**********************************************************************************************************************
1296 SeqMap GetOTURepCommand::getMap(int row) {
1300 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1301 if (rowPositions[row] != -1){
1303 inRow.seekg(rowPositions[row]);
1305 int rowNum, numDists, colNum;
1308 inRow >> rowNum >> numDists;
1310 for(int i = 0; i < numDists; i++) {
1311 inRow >> colNum >> dist;
1312 rowMap[colNum] = dist;
1319 catch(exception& e) {
1320 m->errorOut(e, "GetOTURepCommand", "getMap");
1324 //**********************************************************************************************************************