5 // Created by Sarah Westcott on 4/16/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "makebiomcommand.h"
10 #include "sharedrabundvector.h"
11 #include "inputdata.h"
12 #include "sharedutilities.h"
13 #include "phylotree.h"
15 //taken from http://biom-format.org/documentation/biom_format.html
19 "format": "Biological Observation Matrix 0.9.1",
20 "format_url": "http://biom-format.org",
22 "generated_by": "QIIME revision 1.4.0-dev",
23 "date": "2011-12-19T19:00:00",
25 {"id":"GG_OTU_1", "metadata":null},
26 {"id":"GG_OTU_2", "metadata":null},
27 {"id":"GG_OTU_3", "metadata":null},
28 {"id":"GG_OTU_4", "metadata":null},
29 {"id":"GG_OTU_5", "metadata":null}
32 {"id":"Sample1", "metadata":null},
33 {"id":"Sample2", "metadata":null},
34 {"id":"Sample3", "metadata":null},
35 {"id":"Sample4", "metadata":null},
36 {"id":"Sample5", "metadata":null},
37 {"id":"Sample6", "metadata":null}
39 "matrix_type": "sparse",
40 "matrix_element_type": "int",
63 "format": "Biological Observation Matrix 0.9.1",
64 "format_url": "http://biom-format.org",
66 "generated_by": "QIIME revision 1.4.0-dev",
67 "date": "2011-12-19T19:00:00",
69 {"id":"GG_OTU_1", "metadata":null},
70 {"id":"GG_OTU_2", "metadata":null},
71 {"id":"GG_OTU_3", "metadata":null},
72 {"id":"GG_OTU_4", "metadata":null},
73 {"id":"GG_OTU_5", "metadata":null}
76 {"id":"Sample1", "metadata":null},
77 {"id":"Sample2", "metadata":null},
78 {"id":"Sample3", "metadata":null},
79 {"id":"Sample4", "metadata":null},
80 {"id":"Sample5", "metadata":null},
81 {"id":"Sample6", "metadata":null}
83 "matrix_type": "dense",
84 "matrix_element_type": "int",
86 "data": [[0,0,1,0,0,0],
93 //**********************************************************************************************************************
94 vector<string> MakeBiomCommand::setParameters(){
96 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","biom",false,true,true); parameters.push_back(pshared);
97 CommandParameter pcontaxonomy("constaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pcontaxonomy);
98 CommandParameter preference("reftaxonomy", "InputTypes", "", "", "none", "none", "refPi","",false,false); parameters.push_back(preference);
99 CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pmetadata);
100 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
101 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
102 CommandParameter ppicrust("picrust", "InputTypes", "", "", "none", "none", "refPi","shared",false,false); parameters.push_back(ppicrust);
103 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
104 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
105 CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "","",false,false); parameters.push_back(pmatrixtype);
107 vector<string> myArray;
108 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
111 catch(exception& e) {
112 m->errorOut(e, "MakeBiomCommand", "setParameters");
116 //**********************************************************************************************************************
117 string MakeBiomCommand::getHelpString(){
119 string helpString = "";
120 helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype, picrust, reftaxonomy and label. shared is required, unless you have a valid current file.\n"; //
121 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
122 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
123 helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
124 helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n";
125 helpString += "The metadata parameter is used to provide experimental parameters to the columns. Things like 'sample1 gut human_gut'. \n";
126 helpString += "The picrust parameter is used to provide the greengenes OTU IDs map table. NOTE: Picrust requires a greengenes taxonomy. \n";
127 helpString += "The referencetax parameter is used with the picrust parameter. Picrust requires the greengenes OTU IDs to be in the biom file. \n";
128 helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
129 helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
130 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
131 helpString += "The make.biom command outputs a .biom file.\n";
132 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
135 catch(exception& e) {
136 m->errorOut(e, "MakeBiomCommand", "getHelpString");
140 //**********************************************************************************************************************
141 string MakeBiomCommand::getOutputPattern(string type) {
145 if (type == "biom") { pattern = "[filename],[distance],biom"; }
146 else if (type == "shared") { pattern = "[filename],[distance],biom_shared"; }
147 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
151 catch(exception& e) {
152 m->errorOut(e, "MakeBiomCommand", "getOutputPattern");
157 //**********************************************************************************************************************
158 MakeBiomCommand::MakeBiomCommand(){
160 abort = true; calledHelp = true;
162 vector<string> tempOutNames;
163 outputTypes["biom"] = tempOutNames;
164 outputTypes["shared"] = tempOutNames;
166 catch(exception& e) {
167 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
171 //**********************************************************************************************************************
172 MakeBiomCommand::MakeBiomCommand(string option) {
174 abort = false; calledHelp = false;
177 //allow user to run help
178 if(option == "help") { help(); abort = true; calledHelp = true; }
179 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
182 vector<string> myArray = setParameters();
184 OptionParser parser(option);
185 map<string,string> parameters = parser.getParameters();
186 map<string,string>::iterator it;
188 ValidParameters validParameter;
190 //check to make sure all parameters are valid for command
191 for (it = parameters.begin(); it != parameters.end(); it++) {
192 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
195 //initialize outputTypes
196 vector<string> tempOutNames;
197 outputTypes["biom"] = tempOutNames;
198 outputTypes["shared"] = tempOutNames;
200 //if the user changes the input directory command factory will send this info to us in the output parameter
201 string inputDir = validParameter.validFile(parameters, "inputdir", false);
202 if (inputDir == "not found"){ inputDir = ""; }
205 it = parameters.find("shared");
206 //user has given a template file
207 if(it != parameters.end()){
208 path = m->hasPath(it->second);
209 //if the user has not given a path then, add inputdir. else leave path alone.
210 if (path == "") { parameters["shared"] = inputDir + it->second; }
213 it = parameters.find("constaxonomy");
214 //user has given a template file
215 if(it != parameters.end()){
216 path = m->hasPath(it->second);
217 //if the user has not given a path then, add inputdir. else leave path alone.
218 if (path == "") { parameters["constaxonomy"] = inputDir + it->second; }
221 it = parameters.find("reftaxonomy");
222 //user has given a template file
223 if(it != parameters.end()){
224 path = m->hasPath(it->second);
225 //if the user has not given a path then, add inputdir. else leave path alone.
226 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
229 it = parameters.find("picrust");
230 //user has given a template file
231 if(it != parameters.end()){
232 path = m->hasPath(it->second);
233 //if the user has not given a path then, add inputdir. else leave path alone.
234 if (path == "") { parameters["picrust"] = inputDir + it->second; }
237 it = parameters.find("metadata");
238 //user has given a template file
239 if(it != parameters.end()){
240 path = m->hasPath(it->second);
241 //if the user has not given a path then, add inputdir. else leave path alone.
242 if (path == "") { parameters["metadata"] = inputDir + it->second; }
247 sharedfile = validParameter.validFile(parameters, "shared", true);
248 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
249 else if (sharedfile == "not found") {
250 //if there is a current shared file, use it
251 sharedfile = m->getSharedFile();
252 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
253 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
254 }else { m->setSharedFile(sharedfile); }
257 //if the user changes the output directory command factory will send this info to us in the output parameter
258 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(sharedfile); }
260 contaxonomyfile = validParameter.validFile(parameters, "constaxonomy", true);
261 if (contaxonomyfile == "not found") { contaxonomyfile = ""; }
262 else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
264 referenceTax = validParameter.validFile(parameters, "reftaxonomy", true);
265 if (referenceTax == "not found") { referenceTax = ""; }
266 else if (referenceTax == "not open") { referenceTax = ""; abort = true; }
268 picrustOtuFile = validParameter.validFile(parameters, "picrust", true);
269 if (picrustOtuFile == "not found") { picrustOtuFile = ""; }
270 else if (picrustOtuFile == "not open") { picrustOtuFile = ""; abort = true; }
272 metadatafile = validParameter.validFile(parameters, "metadata", true);
273 if (metadatafile == "not found") { metadatafile = ""; }
274 else if (metadatafile == "not open") { metadatafile = ""; abort = true; }
276 //check for optional parameter and set defaults
277 // ...at some point should added some additional type checking...
278 label = validParameter.validFile(parameters, "label", false);
279 if (label == "not found") { label = ""; }
281 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
282 else { allLines = 1; }
285 groups = validParameter.validFile(parameters, "groups", false);
286 if (groups == "not found") { groups = ""; }
288 m->splitAtDash(groups, Groups);
289 m->setGroups(Groups);
292 if (picrustOtuFile != "") {
294 if (contaxonomyfile == "") { m->mothurOut("[ERROR]: the constaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true; }
295 if (referenceTax == "") { m->mothurOut("[ERROR]: the reftaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true; }
296 }else { picrust=false; }
298 if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
300 format = validParameter.validFile(parameters, "matrixtype", false); if (format == "not found") { format = "sparse"; }
302 if ((format != "sparse") && (format != "dense")) {
303 m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true;
308 catch(exception& e) {
309 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
313 //**********************************************************************************************************************
315 int MakeBiomCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
320 InputData input(sharedfile, "sharedfile");
321 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
322 string lastLabel = lookup[0]->getLabel();
324 //if user did not specify a label, then use first one
325 if ((contaxonomyfile != "") && (labels.size() == 0)) {
327 labels.insert(lastLabel);
330 getSampleMetaData(lookup);
332 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333 set<string> processedLabels;
334 set<string> userLabels = labels;
336 //as long as you are not at the end of the file or done wih the lines you want
337 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
339 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
341 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
343 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
346 processedLabels.insert(lookup[0]->getLabel());
347 userLabels.erase(lookup[0]->getLabel());
350 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
351 string saveLabel = lookup[0]->getLabel();
353 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
354 lookup = input.getSharedRAbundVectors(lastLabel);
355 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
359 processedLabels.insert(lookup[0]->getLabel());
360 userLabels.erase(lookup[0]->getLabel());
362 //restore real lastlabel to save below
363 lookup[0]->setLabel(saveLabel);
366 lastLabel = lookup[0]->getLabel();
368 //prevent memory leak and get next set
369 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
370 lookup = input.getSharedRAbundVectors();
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 //output error messages about any remaining user labels
376 set<string>::iterator it;
377 bool needToRun = false;
378 for (it = userLabels.begin(); it != userLabels.end(); it++) {
379 m->mothurOut("Your file does not include the label " + *it);
380 if (processedLabels.count(lastLabel) != 1) {
381 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
384 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
388 //run last label if you need to
389 if (needToRun == true) {
390 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
391 lookup = input.getSharedRAbundVectors(lastLabel);
393 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
396 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
399 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
401 //set sabund file as new current sabundfile
403 itTypes = outputTypes.find("biom");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
409 m->mothurOutEndLine();
410 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
412 m->mothurOutEndLine();
416 catch(exception& e) {
417 m->errorOut(e, "MakeBiomCommand", "execute");
421 //**********************************************************************************************************************
422 int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
424 map<string, string> variables;
425 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
426 variables["[distance]"] = lookup[0]->getLabel();
427 string outputFileName = getOutputFileName("biom",variables);
429 m->openOutputFile(outputFileName, out);
430 outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
432 string mothurString = "mothur" + toString(m->getVersion());
434 struct tm * timeinfo;
436 timeinfo = localtime ( &rawtime );
437 string dateString = asctime (timeinfo);
438 int pos = dateString.find('\n');
439 if (pos != string::npos) { dateString = dateString.substr(0, pos);}
443 out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
444 out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
447 vector<string> metadata = getMetaData(lookup);
448 int numBins = lookup[0]->getNumBins();
450 if (m->control_pressed) { out.close(); return 0; }
454 {"id":"GG_OTU_1", "metadata":null},
455 {"id":"GG_OTU_2", "metadata":null},
456 {"id":"GG_OTU_3", "metadata":null},
457 {"id":"GG_OTU_4", "metadata":null},
458 {"id":"GG_OTU_5", "metadata":null}
460 out << spaces + "\"rows\":[\n";
461 string rowFront = spaces + spaces + "{\"id\":\"";
462 string rowBack = "\", \"metadata\":";
463 for (int i = 0; i < numBins-1; i++) {
464 if (m->control_pressed) { out.close(); return 0; }
465 out << rowFront << m->currentSharedBinLabels[i] << rowBack << metadata[i] << "},\n";
467 out << rowFront << m->currentSharedBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n";
471 {"id":"Sample1", "metadata":null},
472 {"id":"Sample2", "metadata":null},
473 {"id":"Sample3", "metadata":null},
474 {"id":"Sample4", "metadata":null},
475 {"id":"Sample5", "metadata":null},
476 {"id":"Sample6", "metadata":null}
479 string colBack = "\", \"metadata\":";
480 out << spaces + "\"columns\":[\n";
481 for (int i = 0; i < lookup.size()-1; i++) {
482 if (m->control_pressed) { out.close(); return 0; }
483 out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n";
485 out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n";
487 out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
488 out << spaces + "\"shape\": [" << numBins << "," << lookup.size() << "],\n";
489 out << spaces + "\"data\": [";
491 vector<string> dataRows;
492 if (format == "sparse") {
510 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
512 if (m->control_pressed) { out.close(); return 0; }
514 for (int j = 0; j < lookup.size(); j++) {
515 string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
516 //only print non zero values
517 if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
522 /* "matrix_type": "dense",
523 "matrix_element_type": "int",
525 "data": [[0,0,1,0,0,0],
531 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
533 if (m->control_pressed) { out.close(); return 0; }
535 string binInfo = "[";
536 for (int j = 0; j < lookup.size()-1; j++) {
537 binInfo += toString(lookup[j]->getAbundance(i)) + ",";
539 binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
540 dataRows.push_back(binInfo);
544 for (int i = 0; i < dataRows.size()-1; i++) {
545 out << dataRows[i] << ",\n" + spaces + spaces;
547 out << dataRows[dataRows.size()-1] << "]\n";
554 catch(exception& e) {
555 m->errorOut(e, "MakeBiomCommand", "getBiom");
559 //**********************************************************************************************************************
560 vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
562 vector<string> metadata;
564 if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) { metadata.push_back("null"); } }
567 //read constaxonomy file storing in a map, otulabel -> taxonomy
568 //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
570 m->openInputFile(contaxonomyfile, in);
573 m->getline(in); m->gobble(in);
575 string otuLabel, tax;
577 vector<string> otuLabels;
581 if (m->control_pressed) { in.close(); return metadata; }
583 in >> otuLabel >> size >> tax; m->gobble(in);
585 otuLabels.push_back(otuLabel);
590 //should the labels be Otu001 or PhyloType001
591 string firstBin = m->currentSharedBinLabels[0];
592 string binTag = "Otu";
593 if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType"; }
595 //convert list file bin labels to shared file bin labels
598 map<string, string> labelTaxMap;
599 string snumBins = toString(otuLabels.size());
600 for (int i = 0; i < otuLabels.size(); i++) {
602 if (m->control_pressed) { return metadata; }
604 //if there is a bin label use it otherwise make one
605 if (m->isContainingOnlyDigits(otuLabels[i])) {
606 string binLabel = binTag;
607 string sbinNumber = otuLabels[i];
608 if (sbinNumber.length() < snumBins.length()) {
609 int diff = snumBins.length() - sbinNumber.length();
610 for (int h = 0; h < diff; h++) { binLabel += "0"; }
612 binLabel += sbinNumber;
613 binLabel = m->getSimpleLabel(binLabel);
614 labelTaxMap[binLabel] = taxs[i];
615 }else { labelTaxMap[m->getSimpleLabel(otuLabels[i])] = taxs[i]; }
618 //merges OTUs classified to same gg otuid, sets otulabels to gg otuids, averages confidence scores of merged otus. overwritting of otulabels is fine because constaxonomy only allows for one label to be processed. If this assumption changes, could cause bug.
619 if (picrust) { getGreenGenesOTUIDs(lookup, labelTaxMap); }
621 //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
623 //traverse the binLabels forming the metadata strings and saving them
624 //make sure to sanity check
625 map<string, string>::iterator it;
626 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
628 if (m->control_pressed) { return metadata; }
630 it = labelTaxMap.find(m->getSimpleLabel(m->currentSharedBinLabels[i]));
632 if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentSharedBinLabels[i] + ".\n"); m->control_pressed = true; }
634 vector<string> bootstrapValues;
635 string data = "{\"taxonomy\":[";
637 vector<string> scores;
638 vector<string> taxonomies = parseTax(it->second, scores);
640 for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
641 data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
643 //add bootstrap values if available
644 if (scores[0] != "null") {
645 data += ", \"bootstrap\":[";
647 for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
648 data += scores[scores.size()-1] + "]";
653 metadata.push_back(data);
661 catch(exception& e) {
662 m->errorOut(e, "MakeBiomCommand", "getMetadata");
667 //**********************************************************************************************************************
668 int MakeBiomCommand::getGreenGenesOTUIDs(vector<SharedRAbundVector*>& lookup, map<string, string>& labelTaxMap){
671 PhyloTree phylo(referenceTax);
674 map<string, string> otuMap = readGGOtuMap(); //maps reference ID -> OTU ID
676 if (m->control_pressed) { return 0; }
678 map<string, vector<string> > ggOTUIDs;
679 //loop through otu taxonomies
680 for (map<string, string>::iterator it = labelTaxMap.begin(); it != labelTaxMap.end(); it++) { //maps label -> consensus taxonomy
681 if (m->control_pressed) { break; }
683 //get list of reference ids that map to this taxonomy
684 vector<string> referenceIds = phylo.getSeqs(it->second);
686 if (m->control_pressed) { break; }
688 //look for each one in otu map to find match
689 string otuID = "not found";
690 string referenceString = "";
691 for (int i = 0; i < referenceIds.size(); i++) {
692 referenceString += referenceIds[i] + " ";
693 map<string, string>::iterator itMap = otuMap.find(referenceIds[i]);
694 if (itMap != otuMap.end()) { //found it
695 otuID = itMap->second;
696 i += referenceIds.size(); //stop looking
700 //if found, add otu to ggOTUID list
701 if (otuID != "not found") {
702 map<string, vector<string> >::iterator itGG = ggOTUIDs.find(otuID);
703 if (itGG == ggOTUIDs.end()) {
704 vector<string> temp; temp.push_back(it->first); //save mothur OTU label
705 ggOTUIDs[otuID] = temp;
706 }else { ggOTUIDs[otuID].push_back(it->first); } //add mothur OTU label to list
707 }else { m->mothurOut("[ERROR]: could not find OTUId for " + it->second + ". Its reference sequences are " + referenceString + ".\n"); m->control_pressed = true; }
712 vector<SharedRAbundVector*> newLookup;
713 for (int i = 0; i < lookup.size(); i++) {
714 SharedRAbundVector* temp = new SharedRAbundVector();
715 temp->setLabel(lookup[i]->getLabel());
716 temp->setGroup(lookup[i]->getGroup());
717 newLookup.push_back(temp);
720 map<string, int> labelIndex;
721 for (int i = 0; i < m->currentSharedBinLabels.size(); i++) { labelIndex[m->getSimpleLabel(m->currentSharedBinLabels[i])] = i; }
723 vector<string> newBinLabels;
724 map<string, string> newLabelTaxMap;
725 //loop through ggOTUID list combining mothur otus and adjusting labels
726 //ggOTUIDs = 16097 -> <OTU01, OTU10, OTU22>
727 for (map<string, vector<string> >::iterator itMap = ggOTUIDs.begin(); itMap != ggOTUIDs.end(); itMap++) {
728 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
730 //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
731 //find taxonomy of this otu
732 map<string, string>::iterator it = labelTaxMap.find(m->getSimpleLabel(itMap->second[0]));
733 vector<string> scores;
734 vector<string> taxonomies = parseTax(it->second, scores);
736 //merge/set OTU abundances
737 vector<int> abunds; abunds.resize(lookup.size(), 0);
738 string mergeString = "";
739 vector<float> boots; boots.resize(scores.size(), 0);
740 for (int j = 0; j < itMap->second.size(); j++) { //<OTU01, OTU10, OTU22>
741 //merge bootstrap scores
742 vector<string> scores;
743 vector<string> taxonomies = parseTax(it->second, scores);
744 for (int i = 0; i < boots.size(); i++) {
745 float tempScore; m->mothurConvert(scores[i], tempScore);
746 boots[i] += tempScore;
750 mergeString += (itMap->second)[j] + " ";
751 for (int i = 0; i < lookup.size(); i++) {
752 abunds[i] += lookup[i]->getAbundance(labelIndex[m->getSimpleLabel((itMap->second)[j])]);
756 if (m->debug) { m->mothurOut("[DEBUG]: merging " + mergeString + " for ggOTUid = " + itMap->first + ".\n"); }
759 //add merged otu to new lookup
760 for (int j = 0; j < boots.size(); j++) { boots[j] /= (float) itMap->second.size(); }
762 //assemble new taxomoy
763 string newTaxString = "";
764 for (int j = 0; j < boots.size(); j++) {
765 newTaxString += taxonomies[j] + "(" + toString(boots[j]) + ");";
768 //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
769 //find taxonomy of this otu
770 newLabelTaxMap[itMap->first] = newTaxString;
772 //add merged otu to new lookup
773 for (int j = 0; j < abunds.size(); j++) { newLookup[j]->push_back(abunds[j], newLookup[j]->getGroup()); }
776 newBinLabels.push_back(itMap->first);
779 for (int j = 0; j < lookup.size(); j++) { delete lookup[j]; }
782 m->currentSharedBinLabels = newBinLabels;
783 labelTaxMap = newLabelTaxMap;
785 map<string, string> variables;
786 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
787 variables["[distance]"] = lookup[0]->getLabel();
788 string outputFileName = getOutputFileName("shared",variables);
790 m->openOutputFile(outputFileName, out);
791 outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
793 lookup[0]->printHeaders(out);
795 for (int i = 0; i < lookup.size(); i++) {
796 out << lookup[i]->getLabel() << '\t' << lookup[i]->getGroup() << '\t';
797 lookup[i]->print(out);
803 catch(exception& e) {
804 m->errorOut(e, "MakeBiomCommand", "getGreenGenesOTUIDs");
809 //**********************************************************************************************************************
810 map<string, string> MakeBiomCommand::readGGOtuMap(){
812 map<string, string> otuMap;
815 m->openInputFile(picrustOtuFile, in);
817 //map referenceIDs -> otuIDs
819 //16097 671376 616121 533566 683683 4332909 4434717 772666 611808 695209
821 if (m->control_pressed) { break; }
823 string line = m->getline(in); m->gobble(in);
824 vector<string> pieces = m->splitWhiteSpace(line);
826 if (pieces.size() != 0) {
827 string otuID = pieces[1];
828 for (int i = 1; i < pieces.size(); i++) { otuMap[pieces[i]] = otuID; }
835 catch(exception& e) {
836 m->errorOut(e, "MakeBiomCommand", "readGGOtuMap");
841 //**********************************************************************************************************************
842 int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
844 sampleMetadata.clear();
845 if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } }
848 m->openInputFile(metadatafile, in);
850 vector<string> groupNames, metadataLabels;
851 map<string, vector<string> > lines;
853 string headerLine = m->getline(in); m->gobble(in);
854 vector<string> pieces = m->splitWhiteSpace(headerLine);
856 //save names of columns you are reading
857 for (int i = 1; i < pieces.size(); i++) {
858 metadataLabels.push_back(pieces[i]);
860 int count = metadataLabels.size();
862 vector<string> groups = m->getGroups();
867 if (m->control_pressed) { in.close(); return 0; }
870 in >> group; m->gobble(in);
871 groupNames.push_back(group);
873 string line = m->getline(in); m->gobble(in);
874 vector<string> thisPieces = m->splitWhiteSpaceWithQuotes(line);
876 if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); }
877 else { if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } }
883 map<string, vector<string> >::iterator it;
884 for (int i = 0; i < lookup.size(); i++) {
886 if (m->control_pressed) { return 0; }
888 it = lines.find(lookup[i]->getGroup());
890 if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; }
892 vector<string> values = it->second;
895 for (int j = 0; j < metadataLabels.size()-1; j++) {
896 values[j] = m->removeQuotes(values[j]);
897 data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", ";
899 values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]);
900 data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}";
901 sampleMetadata.push_back(data);
909 catch(exception& e) {
910 m->errorOut(e, "MakeBiomCommand", "getSampleMetaData");
916 /**************************************************************************************************/
917 //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
918 vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
924 while (tax.find_first_of(';') != -1) {
926 if (m->control_pressed) { return taxs; }
929 taxon = tax.substr(0,tax.find_first_of(';'));
931 int pos = taxon.find_last_of('(');
934 int pos2 = taxon.find_last_of(')');
936 string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
937 if (m->isNumeric1(confidenceScore)) {
938 taxon = taxon.substr(0, pos); //rip off confidence
939 scores.push_back(confidenceScore);
940 }else{ scores.push_back("null"); }
942 }else{ scores.push_back("null"); }
944 //strip "" if they are there
945 pos = taxon.find("\"");
946 if (pos != string::npos) {
948 for (int k = 0; k < taxon.length(); k++) {
949 if (taxon[k] != '\"') { newTax += taxon[k]; }
954 //look for bootstrap value
955 taxs.push_back(taxon);
956 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
961 catch(exception& e) {
962 m->errorOut(e, "MakeBiomCommand", "parseTax");
967 //**********************************************************************************************************************