5 // Created by Sarah Westcott on 4/16/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "makebiomcommand.h"
10 #include "sharedrabundvector.h"
11 #include "inputdata.h"
13 //taken from http://biom-format.org/documentation/biom_format.html
17 "format": "Biological Observation Matrix 0.9.1",
18 "format_url": "http://biom-format.org",
20 "generated_by": "QIIME revision 1.4.0-dev",
21 "date": "2011-12-19T19:00:00",
23 {"id":"GG_OTU_1", "metadata":null},
24 {"id":"GG_OTU_2", "metadata":null},
25 {"id":"GG_OTU_3", "metadata":null},
26 {"id":"GG_OTU_4", "metadata":null},
27 {"id":"GG_OTU_5", "metadata":null}
30 {"id":"Sample1", "metadata":null},
31 {"id":"Sample2", "metadata":null},
32 {"id":"Sample3", "metadata":null},
33 {"id":"Sample4", "metadata":null},
34 {"id":"Sample5", "metadata":null},
35 {"id":"Sample6", "metadata":null}
37 "matrix_type": "sparse",
38 "matrix_element_type": "int",
61 "format": "Biological Observation Matrix 0.9.1",
62 "format_url": "http://biom-format.org",
64 "generated_by": "QIIME revision 1.4.0-dev",
65 "date": "2011-12-19T19:00:00",
67 {"id":"GG_OTU_1", "metadata":null},
68 {"id":"GG_OTU_2", "metadata":null},
69 {"id":"GG_OTU_3", "metadata":null},
70 {"id":"GG_OTU_4", "metadata":null},
71 {"id":"GG_OTU_5", "metadata":null}
74 {"id":"Sample1", "metadata":null},
75 {"id":"Sample2", "metadata":null},
76 {"id":"Sample3", "metadata":null},
77 {"id":"Sample4", "metadata":null},
78 {"id":"Sample5", "metadata":null},
79 {"id":"Sample6", "metadata":null}
81 "matrix_type": "dense",
82 "matrix_element_type": "int",
84 "data": [[0,0,1,0,0,0],
91 //**********************************************************************************************************************
92 vector<string> MakeBiomCommand::setParameters(){
94 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pshared);
95 CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pcontaxonomy);
96 CommandParameter pgroups("groups", "String", "", "", "", "", "",false,false); parameters.push_back(pgroups);
97 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
98 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
99 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
100 CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "",false,false); parameters.push_back(pmatrixtype);
102 vector<string> myArray;
103 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
106 catch(exception& e) {
107 m->errorOut(e, "MakeBiomCommand", "setParameters");
111 //**********************************************************************************************************************
112 string MakeBiomCommand::getHelpString(){
114 string helpString = "";
115 helpString += "The make.biom command parameters are shared, contaxonomy, groups, matrixtype and label. shared is required, unless you have a valid current file.\n";
116 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
117 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
118 helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
119 helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled.\n";
120 helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
121 helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
122 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
123 helpString += "The make.biom command outputs a .biom file.\n";
124 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
127 catch(exception& e) {
128 m->errorOut(e, "MakeBiomCommand", "getHelpString");
132 //**********************************************************************************************************************
133 string MakeBiomCommand::getOutputFileNameTag(string type, string inputName=""){
135 string outputFileName = "";
136 map<string, vector<string> >::iterator it;
138 //is this a type this command creates
139 it = outputTypes.find(type);
140 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
142 if (type == "biom") { outputFileName = "biom"; }
143 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
145 return outputFileName;
147 catch(exception& e) {
148 m->errorOut(e, "MakeBiomCommand", "getOutputFileNameTag");
153 //**********************************************************************************************************************
154 MakeBiomCommand::MakeBiomCommand(){
156 abort = true; calledHelp = true;
158 vector<string> tempOutNames;
159 outputTypes["biom"] = tempOutNames;
161 catch(exception& e) {
162 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
166 //**********************************************************************************************************************
167 MakeBiomCommand::MakeBiomCommand(string option) {
169 abort = false; calledHelp = false;
172 //allow user to run help
173 if(option == "help") { help(); abort = true; calledHelp = true; }
174 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
177 vector<string> myArray = setParameters();
179 OptionParser parser(option);
180 map<string,string> parameters = parser.getParameters();
181 map<string,string>::iterator it;
183 ValidParameters validParameter;
185 //check to make sure all parameters are valid for command
186 for (it = parameters.begin(); it != parameters.end(); it++) {
187 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
190 //initialize outputTypes
191 vector<string> tempOutNames;
192 outputTypes["biom"] = tempOutNames;
194 //if the user changes the input directory command factory will send this info to us in the output parameter
195 string inputDir = validParameter.validFile(parameters, "inputdir", false);
196 if (inputDir == "not found"){ inputDir = ""; }
199 it = parameters.find("shared");
200 //user has given a template file
201 if(it != parameters.end()){
202 path = m->hasPath(it->second);
203 //if the user has not given a path then, add inputdir. else leave path alone.
204 if (path == "") { parameters["shared"] = inputDir + it->second; }
207 it = parameters.find("contaxonomy");
208 //user has given a template file
209 if(it != parameters.end()){
210 path = m->hasPath(it->second);
211 //if the user has not given a path then, add inputdir. else leave path alone.
212 if (path == "") { parameters["contaxonomy"] = inputDir + it->second; }
217 sharedfile = validParameter.validFile(parameters, "shared", true);
218 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
219 else if (sharedfile == "not found") {
220 //if there is a current shared file, use it
221 sharedfile = m->getSharedFile();
222 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
223 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
224 }else { m->setSharedFile(sharedfile); }
227 //if the user changes the output directory command factory will send this info to us in the output parameter
228 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(sharedfile); }
230 contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
231 if (contaxonomyfile == "not found") { contaxonomyfile = ""; }
232 else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
235 //check for optional parameter and set defaults
236 // ...at some point should added some additional type checking...
237 label = validParameter.validFile(parameters, "label", false);
238 if (label == "not found") { label = ""; }
240 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
241 else { allLines = 1; }
244 groups = validParameter.validFile(parameters, "groups", false);
245 if (groups == "not found") { groups = ""; }
247 m->splitAtDash(groups, Groups);
248 m->setGroups(Groups);
251 if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
253 format = validParameter.validFile(parameters, "matrixtype", false); if (format == "not found") { format = "sparse"; }
255 if ((format != "sparse") && (format != "dense")) {
256 m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true;
261 catch(exception& e) {
262 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
266 //**********************************************************************************************************************
268 int MakeBiomCommand::execute(){
271 if (abort == true) { if (calledHelp) { return 0; } return 2; }
273 InputData input(sharedfile, "sharedfile");
274 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
275 string lastLabel = lookup[0]->getLabel();
277 //if user did not specify a label, then use first one
278 if ((contaxonomyfile != "") && (labels.size() == 0)) {
280 labels.insert(lastLabel);
283 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
284 set<string> processedLabels;
285 set<string> userLabels = labels;
287 //as long as you are not at the end of the file or done wih the lines you want
288 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
290 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
292 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
294 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
297 processedLabels.insert(lookup[0]->getLabel());
298 userLabels.erase(lookup[0]->getLabel());
301 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
302 string saveLabel = lookup[0]->getLabel();
304 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
305 lookup = input.getSharedRAbundVectors(lastLabel);
306 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
310 processedLabels.insert(lookup[0]->getLabel());
311 userLabels.erase(lookup[0]->getLabel());
313 //restore real lastlabel to save below
314 lookup[0]->setLabel(saveLabel);
317 lastLabel = lookup[0]->getLabel();
319 //prevent memory leak and get next set
320 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
321 lookup = input.getSharedRAbundVectors();
324 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
326 //output error messages about any remaining user labels
327 set<string>::iterator it;
328 bool needToRun = false;
329 for (it = userLabels.begin(); it != userLabels.end(); it++) {
330 m->mothurOut("Your file does not include the label " + *it);
331 if (processedLabels.count(lastLabel) != 1) {
332 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
335 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
339 //run last label if you need to
340 if (needToRun == true) {
341 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
342 lookup = input.getSharedRAbundVectors(lastLabel);
344 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
347 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
350 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
352 //set sabund file as new current sabundfile
354 itTypes = outputTypes.find("biom");
355 if (itTypes != outputTypes.end()) {
356 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
360 m->mothurOutEndLine();
361 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
362 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
363 m->mothurOutEndLine();
367 catch(exception& e) {
368 m->errorOut(e, "MakeBiomCommand", "execute");
372 //**********************************************************************************************************************
373 int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
376 string outputFileName = outputDir + m->getRootName(m->getSimpleName(sharedfile)) + lookup[0]->getLabel() + "." + getOutputFileNameTag("biom");
378 m->openOutputFile(outputFileName, out);
379 outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
381 string mothurString = "mothur" + toString(m->getVersion());
383 struct tm * timeinfo;
385 timeinfo = localtime ( &rawtime );
386 string dateString = asctime (timeinfo);
387 int pos = dateString.find('\n');
388 if (pos != string::npos) { dateString = dateString.substr(0, pos);}
392 out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
393 out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
395 vector<string> metadata = getMetaData(lookup);
397 if (m->control_pressed) { out.close(); return 0; }
401 {"id":"GG_OTU_1", "metadata":null},
402 {"id":"GG_OTU_2", "metadata":null},
403 {"id":"GG_OTU_3", "metadata":null},
404 {"id":"GG_OTU_4", "metadata":null},
405 {"id":"GG_OTU_5", "metadata":null}
407 out << spaces + "\"rows\":[\n";
408 string rowFront = spaces + spaces + "{\"id\":\"";
409 string rowBack = "\", \"metadata\":";
410 for (int i = 0; i < m->currentBinLabels.size()-1; i++) {
411 if (m->control_pressed) { out.close(); return 0; }
412 out << rowFront << m->currentBinLabels[i] << rowBack << metadata[i] << "},\n";
414 out << rowFront << m->currentBinLabels[(m->currentBinLabels.size()-1)] << rowBack << metadata[(m->currentBinLabels.size()-1)] << "}\n" + spaces + "],\n";
418 {"id":"Sample1", "metadata":null},
419 {"id":"Sample2", "metadata":null},
420 {"id":"Sample3", "metadata":null},
421 {"id":"Sample4", "metadata":null},
422 {"id":"Sample5", "metadata":null},
423 {"id":"Sample6", "metadata":null}
426 string colBack = "\", \"metadata\":null}";
427 out << spaces + "\"columns\":[\n";
428 for (int i = 0; i < lookup.size()-1; i++) {
429 if (m->control_pressed) { out.close(); return 0; }
430 out << rowFront << lookup[i]->getGroup() << colBack << ",\n";
432 out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << "\n" + spaces + "],\n";
434 out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
435 out << spaces + "\"shape\": [" << m->currentBinLabels.size() << "," << lookup.size() << "],\n";
436 out << spaces + "\"data\": [";
438 vector<string> dataRows;
439 if (format == "sparse") {
457 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
459 if (m->control_pressed) { out.close(); return 0; }
461 for (int j = 0; j < lookup.size(); j++) {
462 string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
463 //only print non zero values
464 if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
469 /* "matrix_type": "dense",
470 "matrix_element_type": "int",
472 "data": [[0,0,1,0,0,0],
478 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
480 if (m->control_pressed) { out.close(); return 0; }
482 string binInfo = "[";
483 for (int j = 0; j < lookup.size()-1; j++) {
484 binInfo += toString(lookup[j]->getAbundance(i)) + ",";
486 binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
487 dataRows.push_back(binInfo);
491 for (int i = 0; i < dataRows.size()-1; i++) {
492 out << dataRows[i] << ",\n" + spaces + spaces;
494 out << dataRows[dataRows.size()-1] << "]\n";
501 catch(exception& e) {
502 m->errorOut(e, "MakeBiomCommand", "getBiom");
506 //**********************************************************************************************************************
507 vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
509 vector<string> metadata;
511 if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) { metadata.push_back("null"); } }
514 //read constaxonomy file storing in a map, otulabel -> taxonomy
515 //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
517 m->openInputFile(contaxonomyfile, in);
520 m->getline(in); m->gobble(in);
522 string otuLabel, tax;
524 vector<string> otuLabels;
528 if (m->control_pressed) { in.close(); return metadata; }
530 in >> otuLabel >> size >> tax; m->gobble(in);
532 otuLabels.push_back(otuLabel);
537 //should the labels be Otu001 or PhyloType001
538 string firstBin = m->currentBinLabels[0];
539 string binTag = "Otu";
540 if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType"; }
542 //convert list file bin labels to shared file bin labels
545 map<string, string> labelTaxMap;
546 string snumBins = toString(otuLabels.size());
547 for (int i = 0; i < otuLabels.size(); i++) {
549 if (m->control_pressed) { return metadata; }
551 //if there is a bin label use it otherwise make one
552 if (m->isContainingOnlyDigits(otuLabels[i])) {
553 string binLabel = binTag;
554 string sbinNumber = otuLabels[i];
555 if (sbinNumber.length() < snumBins.length()) {
556 int diff = snumBins.length() - sbinNumber.length();
557 for (int h = 0; h < diff; h++) { binLabel += "0"; }
559 binLabel += sbinNumber;
560 labelTaxMap[binLabel] = taxs[i];
561 }else { labelTaxMap[otuLabels[i]] = taxs[i]; }
565 //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
567 //traverse the binLabels forming the metadata strings and saving them
568 //make sure to sanity check
569 map<string, string>::iterator it;
570 for (int i = 0; i < m->currentBinLabels.size(); i++) {
572 if (m->control_pressed) { return metadata; }
574 it = labelTaxMap.find(m->currentBinLabels[i]);
576 if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentBinLabels[i] + ".\n"); m->control_pressed = true; }
578 vector<string> bootstrapValues;
579 string data = "{\"taxonomy\":[";
581 vector<string> scores;
582 vector<string> taxonomies = parseTax(it->second, scores);
584 for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
585 data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
587 //add bootstrap values if available
588 if (scores[0] != "null") {
589 data += ", \"bootstrap\":[";
591 for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
592 data += scores[scores.size()-1] + "]";
597 metadata.push_back(data);
605 catch(exception& e) {
606 m->errorOut(e, "MakeBiomCommand", "getMetadata");
611 /**************************************************************************************************/
612 //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
613 vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
619 while (tax.find_first_of(';') != -1) {
621 if (m->control_pressed) { return taxs; }
624 taxon = tax.substr(0,tax.find_first_of(';'));
626 int pos = taxon.find_last_of('(');
629 int pos2 = taxon.find_last_of(')');
631 string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
632 if (m->isNumeric1(confidenceScore)) {
633 taxon = taxon.substr(0, pos); //rip off confidence
634 scores.push_back(confidenceScore);
635 }else{ scores.push_back("null"); }
637 }else{ scores.push_back("null"); }
639 //strip "" if they are there
640 pos = taxon.find("\"");
641 if (pos != string::npos) {
643 for (int k = 0; k < taxon.length(); k++) {
644 if (taxon[k] != '\"') { newTax += taxon[k]; }
649 //look for bootstrap value
650 taxs.push_back(taxon);
651 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
656 catch(exception& e) {
657 m->errorOut(e, "MakeBiomCommand", "parseTax");
662 //**********************************************************************************************************************