5 * Created by Sarah Westcott on 4/6/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "getoturepcommand.h"
11 #include "readphylip.h"
12 #include "readcolumn.h"
13 #include "formatphylip.h"
14 #include "formatcolumn.h"
15 #include "sharedutilities.h"
18 //********************************************************************************************************************
19 //sorts lowest to highest
20 inline bool compareName(repStruct left, repStruct right){
21 return (left.name < right.name);
23 //********************************************************************************************************************
24 //sorts lowest to highest
25 inline bool compareBin(repStruct left, repStruct right){
26 return (left.bin < right.bin);
28 //********************************************************************************************************************
29 //sorts lowest to highest
30 inline bool compareSize(repStruct left, repStruct right){
31 return (left.size < right.size);
33 //********************************************************************************************************************
34 //sorts lowest to highest
35 inline bool compareGroup(repStruct left, repStruct right){
36 return (left.group < right.group);
39 //**********************************************************************************************************************
40 vector<string> GetOTURepCommand::setParameters(){
42 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none","name",false,true, true); parameters.push_back(plist);
43 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,false, true); parameters.push_back(pfasta);
44 CommandParameter pphylip("phylip", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "none","",false,false, true); parameters.push_back(pphylip);
45 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "ColumnName","",false,false, true); parameters.push_back(pname);
46 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "ColumnName","count",false,false, true); parameters.push_back(pcount);
47 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false, true); parameters.push_back(pgroup);
48 CommandParameter pcolumn("column", "InputTypes", "", "", "PhylipColumn", "PhylipColumn", "ColumnName","",false,false, true); parameters.push_back(pcolumn);
49 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
50 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
51 CommandParameter pcutoff("cutoff", "Number", "", "10", "", "", "","",false,false); parameters.push_back(pcutoff);
52 CommandParameter pprecision("precision", "Number", "", "100", "", "", "","",false,false); parameters.push_back(pprecision);
53 CommandParameter pweighted("weighted", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pweighted);
54 CommandParameter psorted("sorted", "Multiple", "none-name-bin-size-group", "none", "", "", "","",false,false); parameters.push_back(psorted);
55 CommandParameter plarge("large", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(plarge);
56 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
57 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
59 vector<string> myArray;
60 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
64 m->errorOut(e, "GetOTURepCommand", "setParameters");
68 //**********************************************************************************************************************
69 string GetOTURepCommand::getHelpString(){
71 string helpString = "";
72 helpString += "The get.oturep command parameters are phylip, column, list, fasta, name, group, count, large, weighted, cutoff, precision, groups, sorted and label. The list parameter is required, as well as phylip or column and name, unless you have valid current files.\n";
73 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and is separated by dashes.\n";
74 helpString += "The phylip or column parameter is required, but only one may be used. If you use a column file the name or count filename is required. \n";
75 helpString += "If you do not provide a cutoff value 10.00 is assumed. If you do not provide a precision value then 100 is assumed.\n";
76 helpString += "The get.oturep command should be in the following format: get.oturep(phylip=yourDistanceMatrix, fasta=yourFastaFile, list=yourListFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
77 helpString += "Example get.oturep(phylip=amazon.dist, fasta=amazon.fasta, list=amazon.fn.list, group=amazon.groups).\n";
78 helpString += "The default value for label is all labels in your inputfile.\n";
79 helpString += "The sorted parameter allows you to indicate you want the output sorted. You can sort by sequence name, bin number, bin size or group. The default is no sorting, but your options are name, number, size, or group.\n";
80 helpString += "The large parameter allows you to indicate that your distance matrix is too large to fit in RAM. The default value is false.\n";
81 helpString += "The weighted parameter allows you to indicate that want to find the weighted representative. You must provide a namesfile to set weighted to true. The default value is false.\n";
82 helpString += "The representative is found by selecting the sequence that has the smallest total distance to all other sequences in the OTU. If a tie occurs the smallest average distance is used.\n";
83 helpString += "For weighted = false, mothur assumes the distance file contains only unique sequences, the list file may contain all sequences, but only the uniques are considered to become the representative. If your distance file contains all the sequences it would become weighted=true.\n";
84 helpString += "For weighted = true, mothur assumes the distance file contains only unique sequences, the list file must contain all sequences, all sequences are considered to become the representative, but unique name will be used in the output for consistency.\n";
85 helpString += "If your distance file contains all the sequence and you do not provide a name file, the weighted representative will be given, unless your listfile is unique. If you provide a namefile, then you can select weighted or unweighted.\n";
86 helpString += "The group parameter allows you provide a group file.\n";
87 helpString += "The groups parameter allows you to indicate that you want representative sequences for each group specified for each OTU, group name should be separated by dashes. ex. groups=A-B-C.\n";
88 helpString += "The get.oturep command outputs a .fastarep and .rep.names file for each distance you specify, selecting one OTU representative for each bin.\n";
89 helpString += "If you provide a groupfile, then it also appends the names of the groups present in that bin.\n";
90 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
94 m->errorOut(e, "GetOTURepCommand", "getHelpString");
98 //**********************************************************************************************************************
99 string GetOTURepCommand::getOutputPattern(string type) {
103 if (type == "fasta") { pattern = "[filename],[tag],rep.fasta-[filename],[tag],[group],rep.fasta"; }
104 else if (type == "name") { pattern = "[filename],[tag],rep.names-[filename],[tag],[group],rep.names"; }
105 else if (type == "count") { pattern = "[filename],[tag],rep.count_table-[filename],[tag],[group],rep.count_table"; }
106 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
110 catch(exception& e) {
111 m->errorOut(e, "GetOTURepCommand", "getOutputPattern");
115 //**********************************************************************************************************************
116 GetOTURepCommand::GetOTURepCommand(){
118 abort = true; calledHelp = true;
120 vector<string> tempOutNames;
121 outputTypes["fasta"] = tempOutNames;
122 outputTypes["name"] = tempOutNames;
123 outputTypes["count"] = tempOutNames;
125 catch(exception& e) {
126 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
130 //**********************************************************************************************************************
131 GetOTURepCommand::GetOTURepCommand(string option) {
133 abort = false; calledHelp = false;
136 //allow user to run help
137 if (option == "help") {
138 help(); abort = true; calledHelp = true;
139 }else if(option == "citation") { citation(); abort = true; calledHelp = true;
141 vector<string> myArray = setParameters();
143 OptionParser parser(option);
144 map<string, string> parameters = parser.getParameters();
146 ValidParameters validParameter;
147 map<string, string>::iterator it;
149 //check to make sure all parameters are valid for command
150 for (it = parameters.begin(); it != parameters.end(); it++) {
151 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
154 //initialize outputTypes
155 vector<string> tempOutNames;
156 outputTypes["fasta"] = tempOutNames;
157 outputTypes["name"] = tempOutNames;
158 outputTypes["count"] = tempOutNames;
160 //if the user changes the input directory command factory will send this info to us in the output parameter
161 string inputDir = validParameter.validFile(parameters, "inputdir", false);
162 if (inputDir == "not found"){ inputDir = ""; }
165 it = parameters.find("list");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["list"] = inputDir + it->second; }
173 it = parameters.find("fasta");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["fasta"] = inputDir + it->second; }
181 it = parameters.find("phylip");
182 //user has given a template file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["phylip"] = inputDir + it->second; }
189 it = parameters.find("column");
190 //user has given a template file
191 if(it != parameters.end()){
192 path = m->hasPath(it->second);
193 //if the user has not given a path then, add inputdir. else leave path alone.
194 if (path == "") { parameters["column"] = inputDir + it->second; }
197 it = parameters.find("name");
198 //user has given a template file
199 if(it != parameters.end()){
200 path = m->hasPath(it->second);
201 //if the user has not given a path then, add inputdir. else leave path alone.
202 if (path == "") { parameters["name"] = inputDir + it->second; }
205 it = parameters.find("group");
206 //user has given a template file
207 if(it != parameters.end()){
208 path = m->hasPath(it->second);
209 //if the user has not given a path then, add inputdir. else leave path alone.
210 if (path == "") { parameters["group"] = inputDir + it->second; }
213 it = parameters.find("count");
214 //user has given a template file
215 if(it != parameters.end()){
216 path = m->hasPath(it->second);
217 //if the user has not given a path then, add inputdir. else leave path alone.
218 if (path == "") { parameters["count"] = inputDir + it->second; }
223 //if the user changes the output directory command factory will send this info to us in the output parameter
224 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
226 //check for required parameters
227 fastafile = validParameter.validFile(parameters, "fasta", true);
228 if (fastafile == "not found") { fastafile = ""; }
229 else if (fastafile == "not open") { abort = true; }
230 else { m->setFastaFile(fastafile); }
232 listfile = validParameter.validFile(parameters, "list", true);
233 if (listfile == "not found") {
234 listfile = m->getListFile();
235 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
236 else { m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
238 else if (listfile == "not open") { abort = true; }
239 else { m->setListFile(listfile); }
241 phylipfile = validParameter.validFile(parameters, "phylip", true);
242 if (phylipfile == "not found") { phylipfile = ""; }
243 else if (phylipfile == "not open") { abort = true; }
244 else { distFile = phylipfile; format = "phylip"; m->setPhylipFile(phylipfile); }
246 columnfile = validParameter.validFile(parameters, "column", true);
247 if (columnfile == "not found") { columnfile = ""; }
248 else if (columnfile == "not open") { abort = true; }
249 else { distFile = columnfile; format = "column"; m->setColumnFile(columnfile); }
251 namefile = validParameter.validFile(parameters, "name", true);
252 if (namefile == "not open") { abort = true; }
253 else if (namefile == "not found") { namefile = ""; }
254 else { m->setNameFile(namefile); }
257 countfile = validParameter.validFile(parameters, "count", true);
258 if (countfile == "not found") { countfile = ""; }
259 else if (countfile == "not open") { abort = true; countfile = ""; }
261 m->setCountTableFile(countfile);
262 ct.readTable(countfile);
263 if (ct.hasGroupInfo()) { hasGroups = true; }
266 if ((namefile != "") && (countfile != "")) {
267 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
270 if ((groupfile != "") && (countfile != "")) {
271 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
274 if ((phylipfile == "") && (columnfile == "")) { //is there are current file available for either of these?
275 //give priority to column, then phylip
276 columnfile = m->getColumnFile();
277 if (columnfile != "") { distFile = columnfile; format = "column"; m->mothurOut("Using " + columnfile + " as input file for the column parameter."); m->mothurOutEndLine(); }
279 phylipfile = m->getPhylipFile();
280 if (phylipfile != "") { distFile = phylipfile; format = "phylip"; m->mothurOut("Using " + phylipfile + " as input file for the phylip parameter."); m->mothurOutEndLine(); }
282 m->mothurOut("No valid current files. You must provide a phylip or column file before you can use the get.oturep command."); m->mothurOutEndLine();
286 }else if ((phylipfile != "") && (columnfile != "")) { m->mothurOut("When executing a get.oturep command you must enter ONLY ONE of the following: phylip or column."); m->mothurOutEndLine(); abort = true; }
288 if (columnfile != "") {
289 if ((namefile == "") && (countfile == "")) {
290 namefile = m->getNameFile();
291 if (namefile != "") { m->mothurOut("Using " + namefile + " as input file for the name parameter."); m->mothurOutEndLine(); }
293 countfile = m->getCountTableFile();
294 if (countfile != "") { m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
296 m->mothurOut("You need to provide a namefile or countfile if you are going to use the column format."); m->mothurOutEndLine();
303 //check for optional parameter and set defaults
304 // ...at some point should added some additional type checking...
305 label = validParameter.validFile(parameters, "label", false);
306 if (label == "not found") { label = ""; allLines = 1; }
308 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
309 else { allLines = 1; }
312 groupfile = validParameter.validFile(parameters, "group", true);
313 if (groupfile == "not open") { groupfile = ""; abort = true; }
314 else if (groupfile == "not found") { groupfile = ""; }
315 else { m->setGroupFile(groupfile); }
317 sorted = validParameter.validFile(parameters, "sorted", false); if (sorted == "not found"){ sorted = ""; }
318 if (sorted == "none") { sorted=""; }
319 if ((sorted != "") && (sorted != "name") && (sorted != "bin") && (sorted != "size") && (sorted != "group")) {
320 m->mothurOut(sorted + " is not a valid option for the sorted parameter. The only options are: name, bin, size and group. I will not sort."); m->mothurOutEndLine();
324 if ((sorted == "group") && ((groupfile == "")&& !hasGroups)) {
325 m->mothurOut("You must provide a groupfile or have a count file with group info to sort by group. I will not sort."); m->mothurOutEndLine();
329 groups = validParameter.validFile(parameters, "groups", false);
330 if (groups == "not found") { groups = ""; }
332 if ((groupfile == "") && (!hasGroups)) {
333 m->mothurOut("You must provide a groupfile to use groups."); m->mothurOutEndLine();
336 m->splitAtDash(groups, Groups);
339 m->setGroups(Groups);
341 string temp = validParameter.validFile(parameters, "large", false); if (temp == "not found") { temp = "F"; }
342 large = m->isTrue(temp);
344 temp = validParameter.validFile(parameters, "weighted", false); if (temp == "not found") { temp = "f"; }
345 weighted = m->isTrue(temp);
347 if ((weighted) && (namefile == "")) { m->mothurOut("You cannot set weighted to true unless you provide a namesfile."); m->mothurOutEndLine(); abort = true; }
349 temp = validParameter.validFile(parameters, "precision", false); if (temp == "not found") { temp = "100"; }
350 m->mothurConvert(temp, precision);
352 temp = validParameter.validFile(parameters, "cutoff", false); if (temp == "not found") { temp = "10.0"; }
353 m->mothurConvert(temp, cutoff);
354 cutoff += (5 / (precision * 10.0));
357 catch(exception& e) {
358 m->errorOut(e, "GetOTURepCommand", "GetOTURepCommand");
363 //**********************************************************************************************************************
365 int GetOTURepCommand::execute(){
368 if (abort == true) { if (calledHelp) { return 0; } return 2; }
374 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); } return 0; }
376 if (groupfile != "") {
377 //read in group map info.
378 groupMap = new GroupMap(groupfile);
379 int error = groupMap->readMap();
380 if (error == 1) { delete groupMap; m->mothurOut("Error reading your groupfile. Proceeding without groupfile."); m->mothurOutEndLine(); groupfile = ""; }
382 if (Groups.size() != 0) {
384 vector<string> gNamesOfGroups = groupMap->getNamesOfGroups();
385 util.setGroups(Groups, gNamesOfGroups, "getoturep");
386 groupMap->setNamesOfGroups(gNamesOfGroups);
388 }else if (hasGroups) {
389 if (Groups.size() != 0) {
391 vector<string> gNamesOfGroups = ct.getNamesOfGroups();
392 util.setGroups(Groups, gNamesOfGroups, "getoturep");
396 //done with listvector from matrix
397 if (list != NULL) { delete list; }
399 input = new InputData(listfile, "list");
400 list = input->getListVector();
401 string lastLabel = list->getLabel();
403 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
404 set<string> processedLabels;
405 set<string> userLabels = labels;
407 if (m->control_pressed) {
408 if (large) { inRow.close(); m->mothurRemove(distFile); }
409 delete input; delete list; return 0;
412 if ((!weighted) && (namefile != "")) { readNamesFile(weighted); }
414 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
416 if (allLines == 1 || labels.count(list->getLabel()) == 1){
417 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
418 error = process(list);
419 if (error == 1) { return 0; } //there is an error in hte input files, abort command
421 if (m->control_pressed) {
422 if (large) { inRow.close(); m->mothurRemove(distFile); }
423 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
424 delete input; delete list; return 0;
427 processedLabels.insert(list->getLabel());
428 userLabels.erase(list->getLabel());
431 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
432 string saveLabel = list->getLabel();
435 list = input->getListVector(lastLabel);
436 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
437 error = process(list);
438 if (error == 1) { return 0; } //there is an error in hte input files, abort command
440 if (m->control_pressed) {
441 if (large) { inRow.close(); m->mothurRemove(distFile); }
442 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
443 delete input; delete list; return 0;
446 processedLabels.insert(list->getLabel());
447 userLabels.erase(list->getLabel());
449 //restore real lastlabel to save below
450 list->setLabel(saveLabel);
453 lastLabel = list->getLabel();
456 list = input->getListVector();
459 //output error messages about any remaining user labels
460 bool needToRun = false;
461 for (set<string>::iterator it = userLabels.begin(); it != userLabels.end(); it++) {
462 m->mothurOut("Your file does not include the label " + (*it));
463 if (processedLabels.count(lastLabel) != 1) {
464 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
467 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
471 //run last label if you need to
472 if (needToRun == true) {
473 if (list != NULL) { delete list; }
474 list = input->getListVector(lastLabel);
475 m->mothurOut(list->getLabel() + "\t" + toString(list->size())); m->mothurOutEndLine();
476 error = process(list);
478 if (error == 1) { return 0; } //there is an error in hte input files, abort command
480 if (m->control_pressed) {
481 if (large) { inRow.close(); m->mothurRemove(distFile); }
482 for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } outputTypes.clear();
483 delete input; delete list; return 0;
487 //close and remove formatted matrix file
490 m->mothurRemove(distFile);
495 if (!weighted) { nameFileMap.clear(); }
498 if (fastafile != "") {
500 fasta = new FastaMap();
501 fasta->readFastaFile(fastafile);
503 //if user gave a namesfile then use it
504 if (namefile != "") { readNamesFile(); }
506 //output create and output the .rep.fasta files
507 map<string, string>::iterator itNameFile;
508 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
509 processFastaNames(itNameFile->first, itNameFile->second);
512 //output create and output the .rep.fasta files
513 map<string, string>::iterator itNameFile;
514 for (itNameFile = outputNameFiles.begin(); itNameFile != outputNameFiles.end(); itNameFile++) {
515 processNames(itNameFile->first, itNameFile->second);
520 if (groupfile != "") { delete groupMap; }
522 if (m->control_pressed) { return 0; }
524 //set fasta file as new current fastafile - use first one??
526 itTypes = outputTypes.find("fasta");
527 if (itTypes != outputTypes.end()) {
528 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
531 itTypes = outputTypes.find("name");
532 if (itTypes != outputTypes.end()) {
533 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
536 itTypes = outputTypes.find("count");
537 if (itTypes != outputTypes.end()) {
538 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
541 m->mothurOutEndLine();
542 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
543 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
544 m->mothurOutEndLine();
548 catch(exception& e) {
549 m->errorOut(e, "GetOTURepCommand", "execute");
553 //**********************************************************************************************************************
554 int GetOTURepCommand::readDist() {
558 //read distance files
559 if (format == "column") { readMatrix = new ReadColumnMatrix(distFile); }
560 else if (format == "phylip") { readMatrix = new ReadPhylipMatrix(distFile); }
561 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
563 readMatrix->setCutoff(cutoff);
565 NameAssignment* nameMap = NULL;
567 nameMap = new NameAssignment(namefile);
569 readMatrix->read(nameMap);
570 }else if (countfile != "") {
571 readMatrix->read(&ct);
573 readMatrix->read(nameMap);
576 if (m->control_pressed) { delete readMatrix; return 0; }
578 list = readMatrix->getListVector();
579 SparseDistanceMatrix* matrix = readMatrix->getDMatrix();
581 // Create a data structure to quickly access the distance information.
582 // It consists of a vector of distance maps, where each map contains
583 // all distances of a certain sequence. Vector and maps are accessed
584 // via the index of a sequence in the distance matrix
585 seqVec = vector<SeqMap>(list->size());
586 for (int i = 0; i < matrix->seqVec.size(); i++) {
587 for (int j = 0; j < matrix->seqVec[i].size(); j++) {
588 if (m->control_pressed) { delete readMatrix; return 0; }
589 //already added everyone else in row
590 if (i < matrix->seqVec[i][j].index) { seqVec[i][matrix->seqVec[i][j].index] = matrix->seqVec[i][j].dist; }
593 //add dummy map for unweighted calc
595 seqVec.push_back(dummy);
601 if (m->control_pressed) { return 0; }
603 //process file and set up indexes
604 if (format == "column") { formatMatrix = new FormatColumnMatrix(distFile); }
605 else if (format == "phylip") { formatMatrix = new FormatPhylipMatrix(distFile); }
606 else { m->mothurOut("File format error."); m->mothurOutEndLine(); return 0; }
608 formatMatrix->setCutoff(cutoff);
610 NameAssignment* nameMap = NULL;
612 nameMap = new NameAssignment(namefile);
614 formatMatrix->read(nameMap);
615 }else if (countfile != "") {
616 formatMatrix->read(&ct);
618 formatMatrix->read(nameMap);
621 if (m->control_pressed) { delete formatMatrix; return 0; }
623 list = formatMatrix->getListVector();
624 distFile = formatMatrix->getFormattedFileName();
626 //positions in file where the distances for each sequence begin
627 //rowPositions[1] = position in file where distance related to sequence 1 start.
628 rowPositions = formatMatrix->getRowPositions();
629 rowPositions.push_back(-1); //dummy row for unweighted calc
634 //openfile for getMap to use
635 m->openInputFile(distFile, inRow);
637 if (m->control_pressed) { inRow.close(); m->mothurRemove(distFile); return 0; }
641 //list bin 0 = first name read in distance matrix, list bin 1 = second name read in distance matrix
643 vector<string> names;
645 //map names to rows in sparsematrix
646 for (int i = 0; i < list->size(); i++) {
648 binnames = list->get(i);
650 m->splitAtComma(binnames, names);
652 for (int j = 0; j < names.size(); j++) {
653 nameToIndex[names[j]] = i;
656 } else { m->mothurOut("error, no listvector."); m->mothurOutEndLine(); }
658 if (m->control_pressed) { if (large) { inRow.close(); m->mothurRemove(distFile); }return 0; }
662 catch(exception& e) {
663 m->errorOut(e, "GetOTURepCommand", "execute");
667 //**********************************************************************************************************************
668 void GetOTURepCommand::readNamesFile() {
671 vector<string> dupNames;
672 m->openInputFile(namefile, in);
674 string name, names, sequence;
677 in >> name; //read from first column A
678 in >> names; //read from second column A,B,C,D
682 //parse names into vector
683 m->splitAtComma(names, dupNames);
685 //store names in fasta map
686 sequence = fasta->getSequence(name);
687 for (int i = 0; i < dupNames.size(); i++) {
688 fasta->push_back(dupNames[i], sequence);
696 catch(exception& e) {
697 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
701 //**********************************************************************************************************************
702 //read names file to find the weighted rep for each bin
703 void GetOTURepCommand::readNamesFile(bool w) {
706 vector<string> dupNames;
707 m->openInputFile(namefile, in);
709 string name, names, sequence;
712 in >> name; m->gobble(in); //read from first column A
713 in >> names; //read from second column A,B,C,D
717 //parse names into vector
718 m->splitAtComma(names, dupNames);
720 for (int i = 0; i < dupNames.size(); i++) {
721 nameFileMap[dupNames[i]] = name;
729 catch(exception& e) {
730 m->errorOut(e, "GetOTURepCommand", "readNamesFile");
734 //**********************************************************************************************************************
735 string GetOTURepCommand::findRep(vector<string> names, string group) {
737 // if only 1 sequence in bin or processing the "unique" label, then
738 // the first sequence of the OTU is the representative one
739 if ((names.size() == 1)) {
742 vector<int> seqIndex; //(names.size());
743 map<string, string>::iterator itNameFile;
744 map<string, int>::iterator itNameIndex;
746 //fill seqIndex and initialize sums
747 for (size_t i = 0; i < names.size(); i++) {
749 seqIndex.push_back(nameToIndex[names[i]]);
750 if (countfile != "") { //if countfile is not blank then we can assume the list file contains only uniques, otherwise we assume list file contains everyone.
752 if (group != "") { numRep = ct.getGroupCount(names[i], group); }
753 else { numRep = ct.getGroupCount(names[i]); }
754 for (int j = 1; j < numRep; j++) { //don't add yourself again
755 seqIndex.push_back(nameToIndex[names[i]]);
759 if (namefile == "") {
760 itNameIndex = nameToIndex.find(names[i]);
762 if (itNameIndex == nameToIndex.end()) { // you are not in the distance file and no namesfile, then assume you are not unique
763 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
764 else { seqIndex.push_back((seqVec.size()-1)); }
766 seqIndex.push_back(itNameIndex->second);
770 itNameFile = nameFileMap.find(names[i]);
772 if (itNameFile == nameFileMap.end()) {
773 m->mothurOut("[ERROR]: " + names[i] + " is not in your namefile, please correct."); m->mothurOutEndLine(); m->control_pressed = true;
775 string name1 = itNameFile->first;
776 string name2 = itNameFile->second;
778 if (name1 == name2) { //then you are unique so add your real dists
779 seqIndex.push_back(nameToIndex[names[i]]);
781 if (large) { seqIndex.push_back((rowPositions.size()-1)); }
782 else { seqIndex.push_back((seqVec.size()-1)); }
789 vector<float> max_dist(seqIndex.size(), 0.0);
790 vector<float> total_dist(seqIndex.size(), 0.0);
792 // loop through all entries in seqIndex
795 for (size_t i=0; i < seqIndex.size(); i++) {
796 if (m->control_pressed) { return "control"; }
798 if (!large) { currMap = seqVec[seqIndex[i]]; }
799 else { currMap = getMap(seqIndex[i]); }
801 for (size_t j=0; j < seqIndex.size(); j++) {
802 it = currMap.find(seqIndex[j]);
803 if (it != currMap.end()) {
804 max_dist[i] = max(max_dist[i], it->second);
805 max_dist[j] = max(max_dist[j], it->second);
806 total_dist[i] += it->second;
807 total_dist[j] += it->second;
808 }else{ //if you can't find the distance make it the cutoff
809 max_dist[i] = max(max_dist[i], cutoff);
810 max_dist[j] = max(max_dist[j], cutoff);
811 total_dist[i] += cutoff;
812 total_dist[j] += cutoff;
817 // sequence with the smallest maximum distance is the representative
818 //if tie occurs pick sequence with smallest average distance
821 for (size_t i=0; i < max_dist.size(); i++) {
822 if (m->control_pressed) { return "control"; }
823 if (max_dist[i] < min) {
826 }else if (max_dist[i] == min) {
827 float currentAverage = total_dist[minIndex] / (float) total_dist.size();
828 float newAverage = total_dist[i] / (float) total_dist.size();
830 if (newAverage < currentAverage) {
837 return(names[minIndex]);
840 catch(exception& e) {
841 m->errorOut(e, "GetOTURepCommand", "FindRep");
846 //**********************************************************************************************************************
847 int GetOTURepCommand::process(ListVector* processList) {
849 string name, sequence;
853 if (outputDir == "") { outputDir += m->hasPath(listfile); }
855 ofstream newNamesOutput;
856 string outputNamesFile;
857 map<string, ofstream*> filehandles;
859 map<string, string> variables;
860 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
862 if (Groups.size() == 0) { //you don't want to use groups
863 variables["[tag]"] = processList->getLabel();
864 if (countfile == "") {
865 outputNamesFile = getOutputFileName("name", variables);
866 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
868 outputNamesFile = getOutputFileName("count", variables);
869 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
871 outputNameFiles[outputNamesFile] = processList->getLabel();
872 m->openOutputFile(outputNamesFile, newNamesOutput);
873 newNamesOutput << "noGroup" << endl;
874 }else{ //you want to use groups
876 for (int i=0; i<Groups.size(); i++) {
878 variables["[tag]"] = processList->getLabel();
879 variables["[group]"] = Groups[i];
880 filehandles[Groups[i]] = temp;
881 outputNamesFile = outputDir + m->getRootName(m->getSimpleName(listfile)) + processList->getLabel() + "." + Groups[i] + ".";
882 if (countfile == "") {
883 outputNamesFile = getOutputFileName("name", variables);
884 outputNames.push_back(outputNamesFile); outputTypes["name"].push_back(outputNamesFile);
886 outputNamesFile = getOutputFileName("count", variables);
887 outputNames.push_back(outputNamesFile); outputTypes["count"].push_back(outputNamesFile);
890 m->openOutputFile(outputNamesFile, *(temp));
891 *(temp) << Groups[i] << endl;
892 outputNameFiles[outputNamesFile] = processList->getLabel() + "." + Groups[i];
896 //for each bin in the list vector
897 for (int i = 0; i < processList->size(); i++) {
898 if (m->control_pressed) {
900 if (Groups.size() == 0) { //you don't want to use groups
901 newNamesOutput.close();
903 for (int j=0; j<Groups.size(); j++) {
904 (*(filehandles[Groups[j]])).close();
905 delete filehandles[Groups[j]];
911 string temp = processList->get(i);
912 vector<string> namesInBin;
913 m->splitAtComma(temp, namesInBin);
915 if (Groups.size() == 0) {
916 nameRep = findRep(namesInBin, "");
917 newNamesOutput << i << '\t' << nameRep << '\t' << processList->get(i) << endl;
919 map<string, vector<string> > NamesInGroup;
920 for (int j=0; j<Groups.size(); j++) { //initialize groups
921 NamesInGroup[Groups[j]].resize(0);
924 for (int j=0; j<namesInBin.size(); j++) {
925 if (groupfile != "") {
926 string thisgroup = groupMap->getGroup(namesInBin[j]);
927 if (thisgroup == "not found") { m->mothurOut(namesInBin[j] + " is not in your groupfile, please correct."); m->mothurOutEndLine(); m->control_pressed = true; }
929 //add this name to correct group
930 if (m->inUsersGroups(thisgroup, Groups)) { NamesInGroup[thisgroup].push_back(namesInBin[j]); }
932 vector<string> thisSeqsGroups = ct.getGroups(namesInBin[j]);
933 for (int k = 0; k < thisSeqsGroups.size(); k++) {
934 if (m->inUsersGroups(thisSeqsGroups[k], Groups)) { NamesInGroup[thisSeqsGroups[k]].push_back(namesInBin[j]); }
939 //get rep for each group in otu
940 for (int j=0; j<Groups.size(); j++) {
941 if (NamesInGroup[Groups[j]].size() != 0) { //are there members from this group in this otu?
942 //get rep for each group
943 nameRep = findRep(NamesInGroup[Groups[j]], Groups[j]);
945 //output group rep and other members of this group
946 (*(filehandles[Groups[j]])) << i << '\t' << nameRep << '\t';
948 for (int k=0; k<NamesInGroup[Groups[j]].size()-1; k++) {//output list of names in this otu from this group
949 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][k] << ",";
952 (*(filehandles[Groups[j]])) << NamesInGroup[Groups[j]][NamesInGroup[Groups[j]].size()-1] << endl;
958 if (Groups.size() == 0) { //you don't want to use groups
959 newNamesOutput.close();
961 for (int i=0; i<Groups.size(); i++) {
962 (*(filehandles[Groups[i]])).close();
963 delete filehandles[Groups[i]];
970 catch(exception& e) {
971 m->errorOut(e, "GetOTURepCommand", "process");
975 //**********************************************************************************************************************
976 int GetOTURepCommand::processFastaNames(string filename, string label) {
980 if (outputDir == "") { outputDir += m->hasPath(listfile); }
981 map<string, string> variables;
982 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile));
983 variables["[tag]"] = label;
984 string outputFileName = getOutputFileName("fasta",variables);
985 m->openOutputFile(outputFileName, out);
986 vector<repStruct> reps;
987 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
990 string tempNameFile = filename + ".temp";
991 m->openOutputFile(tempNameFile, out2);
994 m->openInputFile(filename, in);
997 string tempGroup = "";
998 in >> tempGroup; m->gobble(in);
1001 if (countfile != "") {
1002 thisCt.readTable(countfile);
1003 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1008 string rep, binnames;
1009 in >> i >> rep >> binnames; m->gobble(in);
1011 vector<string> names;
1012 m->splitAtComma(binnames, names);
1013 int binsize = names.size();
1015 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1017 if (tempGroup == "noGroup") {
1018 for (int j = 0; j < names.size(); j++) {
1019 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1021 binsize = thisCt.getNumSeqs(rep);
1024 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1025 out2 << rep << '\t' << total << '\t' << total << endl;
1029 thistotal += binsize;
1030 //if you have a groupfile
1032 map<string, string> groups;
1033 map<string, string>::iterator groupIt;
1034 if (groupfile != "") {
1035 //find the groups that are in this bin
1036 for (int i = 0; i < names.size(); i++) {
1037 string groupName = groupMap->getGroup(names[i]);
1038 if (groupName == "not found") {
1039 m->mothurOut(names[i] + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
1042 groups[groupName] = groupName;
1046 //turn the groups into a string
1047 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1048 group += groupIt->first + "-";
1051 group = group.substr(0, group.length()-1);
1052 }else if (hasGroups) {
1053 map<string, string> groups;
1054 for (int i = 0; i < names.size(); i++) {
1055 vector<string> thisSeqsGroups = ct.getGroups(names[i]);
1056 for (int j = 0; j < thisSeqsGroups.size(); j++) { groups[thisSeqsGroups[j]] = thisSeqsGroups[j]; }
1058 //turn the groups into a string
1059 for (groupIt = groups.begin(); groupIt != groups.end(); groupIt++) {
1060 group += groupIt->first + "-";
1063 group = group.substr(0, group.length()-1);
1064 //cout << group << endl;
1069 //print out name and sequence for that bin
1070 string sequence = fasta->getSequence(rep);
1072 if (sequence != "not found") {
1073 if (sorted == "") { //print them out
1074 rep = rep + "\t" + toString(i+1);
1075 rep = rep + "|" + toString(binsize);
1077 rep = rep + "|" + group;
1079 out << ">" << rep << endl;
1080 out << sequence << endl;
1082 repStruct newRep(rep, i+1, binsize, group);
1083 reps.push_back(newRep);
1086 m->mothurOut(rep + " is missing from your fasta or name file, ignoring. Please correct."); m->mothurOutEndLine();
1091 if (sorted != "") { //then sort them and print them
1092 if (sorted == "name") { sort(reps.begin(), reps.end(), compareName); }
1093 else if (sorted == "bin") { sort(reps.begin(), reps.end(), compareBin); }
1094 else if (sorted == "size") { sort(reps.begin(), reps.end(), compareSize); }
1095 else if (sorted == "group") { sort(reps.begin(), reps.end(), compareGroup); }
1098 for (int i = 0; i < reps.size(); i++) {
1099 string sequence = fasta->getSequence(reps[i].name);
1100 string outputName = reps[i].name + "\t" + toString(reps[i].bin);
1101 outputName = outputName + "|" + toString(reps[i].size);
1102 if (reps[i].group != "") {
1103 outputName = outputName + "|" + reps[i].group;
1105 out << ">" << outputName << endl;
1106 out << sequence << endl;
1114 m->mothurRemove(filename);
1115 rename(tempNameFile.c_str(), filename.c_str());
1117 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1122 catch(exception& e) {
1123 m->errorOut(e, "GetOTURepCommand", "processFastaNames");
1127 //**********************************************************************************************************************
1128 int GetOTURepCommand::processNames(string filename, string label) {
1131 //create output file
1132 if (outputDir == "") { outputDir += m->hasPath(listfile); }
1135 string tempNameFile = filename + ".temp";
1136 m->openOutputFile(tempNameFile, out2);
1139 m->openInputFile(filename, in);
1142 string rep, binnames;
1144 string tempGroup = "";
1145 in >> tempGroup; m->gobble(in);
1148 if (countfile != "") {
1149 thisCt.readTable(countfile);
1150 if (tempGroup != "noGroup") { out2 << "Representative_Sequence\ttotal\t" << tempGroup << endl; }
1154 if (m->control_pressed) { break; }
1155 in >> i >> rep >> binnames; m->gobble(in);
1157 if (countfile == "") { out2 << rep << '\t' << binnames << endl; }
1159 vector<string> names;
1160 m->splitAtComma(binnames, names);
1161 if (tempGroup == "noGroup") {
1162 for (int j = 0; j < names.size(); j++) {
1163 if (names[j] != rep) { thisCt.mergeCounts(rep, names[j]); }
1167 for (int j = 0; j < names.size(); j++) { total += thisCt.getGroupCount(names[j], tempGroup); }
1168 out2 << rep << '\t' << total << '\t' << total << endl;
1176 m->mothurRemove(filename);
1177 rename(tempNameFile.c_str(), filename.c_str());
1179 if ((countfile != "") && (tempGroup == "noGroup")) { thisCt.printTable(filename); }
1183 catch(exception& e) {
1184 m->errorOut(e, "GetOTURepCommand", "processNames");
1188 //**********************************************************************************************************************
1189 SeqMap GetOTURepCommand::getMap(int row) {
1193 //make sure this row exists in the file, it may not if the seq did not have any distances below the cutoff
1194 if (rowPositions[row] != -1){
1196 inRow.seekg(rowPositions[row]);
1198 int rowNum, numDists, colNum;
1201 inRow >> rowNum >> numDists;
1203 for(int i = 0; i < numDists; i++) {
1204 inRow >> colNum >> dist;
1205 rowMap[colNum] = dist;
1212 catch(exception& e) {
1213 m->errorOut(e, "GetOTURepCommand", "getMap");
1217 //**********************************************************************************************************************