5 // Created by Sarah Westcott on 4/16/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "makebiomcommand.h"
10 #include "sharedrabundvector.h"
11 #include "inputdata.h"
12 #include "sharedutilities.h"
13 #include "phylotree.h"
15 //taken from http://biom-format.org/documentation/biom_format.html
19 "format": "Biological Observation Matrix 0.9.1",
20 "format_url": "http://biom-format.org",
22 "generated_by": "QIIME revision 1.4.0-dev",
23 "date": "2011-12-19T19:00:00",
25 {"id":"GG_OTU_1", "metadata":null},
26 {"id":"GG_OTU_2", "metadata":null},
27 {"id":"GG_OTU_3", "metadata":null},
28 {"id":"GG_OTU_4", "metadata":null},
29 {"id":"GG_OTU_5", "metadata":null}
32 {"id":"Sample1", "metadata":null},
33 {"id":"Sample2", "metadata":null},
34 {"id":"Sample3", "metadata":null},
35 {"id":"Sample4", "metadata":null},
36 {"id":"Sample5", "metadata":null},
37 {"id":"Sample6", "metadata":null}
39 "matrix_type": "sparse",
40 "matrix_element_type": "int",
63 "format": "Biological Observation Matrix 0.9.1",
64 "format_url": "http://biom-format.org",
66 "generated_by": "QIIME revision 1.4.0-dev",
67 "date": "2011-12-19T19:00:00",
69 {"id":"GG_OTU_1", "metadata":null},
70 {"id":"GG_OTU_2", "metadata":null},
71 {"id":"GG_OTU_3", "metadata":null},
72 {"id":"GG_OTU_4", "metadata":null},
73 {"id":"GG_OTU_5", "metadata":null}
76 {"id":"Sample1", "metadata":null},
77 {"id":"Sample2", "metadata":null},
78 {"id":"Sample3", "metadata":null},
79 {"id":"Sample4", "metadata":null},
80 {"id":"Sample5", "metadata":null},
81 {"id":"Sample6", "metadata":null}
83 "matrix_type": "dense",
84 "matrix_element_type": "int",
86 "data": [[0,0,1,0,0,0],
93 //**********************************************************************************************************************
94 vector<string> MakeBiomCommand::setParameters(){
96 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","biom",false,true,true); parameters.push_back(pshared);
97 CommandParameter pcontaxonomy("constaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pcontaxonomy);
98 CommandParameter preference("reftaxonomy", "InputTypes", "", "", "none", "none", "refPi","",false,false); parameters.push_back(preference);
99 CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pmetadata);
100 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
101 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
102 CommandParameter ppicrust("picrust", "InputTypes", "", "", "none", "none", "refPi","shared",false,false); parameters.push_back(ppicrust);
103 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
104 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
105 CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "","",false,false); parameters.push_back(pmatrixtype);
107 vector<string> myArray;
108 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
111 catch(exception& e) {
112 m->errorOut(e, "MakeBiomCommand", "setParameters");
116 //**********************************************************************************************************************
117 string MakeBiomCommand::getHelpString(){
119 string helpString = "";
120 helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype, picrust, reftaxonomy and label. shared is required, unless you have a valid current file.\n"; //
121 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
122 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
123 helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
124 helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance. ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n";
125 helpString += "The metadata parameter is used to provide experimental parameters to the columns. Things like 'sample1 gut human_gut'. \n";
126 helpString += "The picrust parameter is used to provide the greengenes OTU IDs map table. NOTE: Picrust requires a greengenes taxonomy. \n";
127 helpString += "The referencetax parameter is used with the picrust parameter. Picrust requires the greengenes OTU IDs to be in the biom file. \n";
128 helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
129 helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
130 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
131 helpString += "The make.biom command outputs a .biom file.\n";
132 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
135 catch(exception& e) {
136 m->errorOut(e, "MakeBiomCommand", "getHelpString");
140 //**********************************************************************************************************************
141 string MakeBiomCommand::getOutputPattern(string type) {
145 if (type == "biom") { pattern = "[filename],[distance],biom"; }
146 else if (type == "shared") { pattern = "[filename],[distance],biom_shared"; }
147 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
151 catch(exception& e) {
152 m->errorOut(e, "MakeBiomCommand", "getOutputPattern");
157 //**********************************************************************************************************************
158 MakeBiomCommand::MakeBiomCommand(){
160 abort = true; calledHelp = true;
162 vector<string> tempOutNames;
163 outputTypes["biom"] = tempOutNames;
164 outputTypes["shared"] = tempOutNames;
166 catch(exception& e) {
167 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
171 //**********************************************************************************************************************
172 MakeBiomCommand::MakeBiomCommand(string option) {
174 abort = false; calledHelp = false;
177 //allow user to run help
178 if(option == "help") { help(); abort = true; calledHelp = true; }
179 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
182 vector<string> myArray = setParameters();
184 OptionParser parser(option);
185 map<string,string> parameters = parser.getParameters();
186 map<string,string>::iterator it;
188 ValidParameters validParameter;
190 //check to make sure all parameters are valid for command
191 for (it = parameters.begin(); it != parameters.end(); it++) {
192 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
195 //initialize outputTypes
196 vector<string> tempOutNames;
197 outputTypes["biom"] = tempOutNames;
198 outputTypes["shared"] = tempOutNames;
200 //if the user changes the input directory command factory will send this info to us in the output parameter
201 string inputDir = validParameter.validFile(parameters, "inputdir", false);
202 if (inputDir == "not found"){ inputDir = ""; }
205 it = parameters.find("shared");
206 //user has given a template file
207 if(it != parameters.end()){
208 path = m->hasPath(it->second);
209 //if the user has not given a path then, add inputdir. else leave path alone.
210 if (path == "") { parameters["shared"] = inputDir + it->second; }
213 it = parameters.find("constaxonomy");
214 //user has given a template file
215 if(it != parameters.end()){
216 path = m->hasPath(it->second);
217 //if the user has not given a path then, add inputdir. else leave path alone.
218 if (path == "") { parameters["constaxonomy"] = inputDir + it->second; }
221 it = parameters.find("reftaxonomy");
222 //user has given a template file
223 if(it != parameters.end()){
224 path = m->hasPath(it->second);
225 //if the user has not given a path then, add inputdir. else leave path alone.
226 if (path == "") { parameters["reftaxonomy"] = inputDir + it->second; }
229 it = parameters.find("picrust");
230 //user has given a template file
231 if(it != parameters.end()){
232 path = m->hasPath(it->second);
233 //if the user has not given a path then, add inputdir. else leave path alone.
234 if (path == "") { parameters["picrust"] = inputDir + it->second; }
237 it = parameters.find("metadata");
238 //user has given a template file
239 if(it != parameters.end()){
240 path = m->hasPath(it->second);
241 //if the user has not given a path then, add inputdir. else leave path alone.
242 if (path == "") { parameters["metadata"] = inputDir + it->second; }
247 sharedfile = validParameter.validFile(parameters, "shared", true);
248 if (sharedfile == "not open") { sharedfile = ""; abort = true; }
249 else if (sharedfile == "not found") {
250 //if there is a current shared file, use it
251 sharedfile = m->getSharedFile();
252 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
253 else { m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
254 }else { m->setSharedFile(sharedfile); }
257 //if the user changes the output directory command factory will send this info to us in the output parameter
258 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(sharedfile); }
260 contaxonomyfile = validParameter.validFile(parameters, "constaxonomy", true);
261 if (contaxonomyfile == "not found") { contaxonomyfile = ""; }
262 else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
264 referenceTax = validParameter.validFile(parameters, "reftaxonomy", true);
265 if (referenceTax == "not found") { referenceTax = ""; }
266 else if (referenceTax == "not open") { referenceTax = ""; abort = true; }
268 picrustOtuFile = validParameter.validFile(parameters, "picrust", true);
269 if (picrustOtuFile == "not found") { picrustOtuFile = ""; }
270 else if (picrustOtuFile == "not open") { picrustOtuFile = ""; abort = true; }
272 metadatafile = validParameter.validFile(parameters, "metadata", true);
273 if (metadatafile == "not found") { metadatafile = ""; }
274 else if (metadatafile == "not open") { metadatafile = ""; abort = true; }
276 //check for optional parameter and set defaults
277 // ...at some point should added some additional type checking...
278 label = validParameter.validFile(parameters, "label", false);
279 if (label == "not found") { label = ""; }
281 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
282 else { allLines = 1; }
285 groups = validParameter.validFile(parameters, "groups", false);
286 if (groups == "not found") { groups = ""; }
288 m->splitAtDash(groups, Groups);
289 m->setGroups(Groups);
292 if (picrustOtuFile != "") {
294 if (contaxonomyfile == "") { m->mothurOut("[ERROR]: the constaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true; }
295 if (referenceTax == "") { m->mothurOut("[ERROR]: the reftaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true; }
296 }else { picrust=false; }
298 if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
300 format = validParameter.validFile(parameters, "matrixtype", false); if (format == "not found") { format = "sparse"; }
302 if ((format != "sparse") && (format != "dense")) {
303 m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true;
308 catch(exception& e) {
309 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
313 //**********************************************************************************************************************
315 int MakeBiomCommand::execute(){
318 if (abort == true) { if (calledHelp) { return 0; } return 2; }
320 InputData input(sharedfile, "sharedfile");
321 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
322 string lastLabel = lookup[0]->getLabel();
324 //if user did not specify a label, then use first one
325 if ((contaxonomyfile != "") && (labels.size() == 0)) {
327 labels.insert(lastLabel);
330 getSampleMetaData(lookup);
332 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333 set<string> processedLabels;
334 set<string> userLabels = labels;
336 //as long as you are not at the end of the file or done wih the lines you want
337 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
339 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; } return 0; }
341 if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){
343 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
346 processedLabels.insert(lookup[0]->getLabel());
347 userLabels.erase(lookup[0]->getLabel());
350 if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
351 string saveLabel = lookup[0]->getLabel();
353 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
354 lookup = input.getSharedRAbundVectors(lastLabel);
355 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
359 processedLabels.insert(lookup[0]->getLabel());
360 userLabels.erase(lookup[0]->getLabel());
362 //restore real lastlabel to save below
363 lookup[0]->setLabel(saveLabel);
366 lastLabel = lookup[0]->getLabel();
368 //prevent memory leak and get next set
369 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; lookup[i] = NULL; }
370 lookup = input.getSharedRAbundVectors();
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 //output error messages about any remaining user labels
376 set<string>::iterator it;
377 bool needToRun = false;
378 for (it = userLabels.begin(); it != userLabels.end(); it++) {
379 m->mothurOut("Your file does not include the label " + *it);
380 if (processedLabels.count(lastLabel) != 1) {
381 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
384 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
388 //run last label if you need to
389 if (needToRun == true) {
390 for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }
391 lookup = input.getSharedRAbundVectors(lastLabel);
393 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
396 for (int i = 0; i < lookup.size(); i++) { delete lookup[i]; }
399 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
401 //set sabund file as new current sabundfile
403 itTypes = outputTypes.find("biom");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
409 m->mothurOutEndLine();
410 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
412 m->mothurOutEndLine();
416 catch(exception& e) {
417 m->errorOut(e, "MakeBiomCommand", "execute");
421 //**********************************************************************************************************************
422 int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
424 map<string, string> variables;
425 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
426 variables["[distance]"] = lookup[0]->getLabel();
427 string outputFileName = getOutputFileName("biom",variables);
429 m->openOutputFile(outputFileName, out);
430 outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
432 string mothurString = "mothur" + toString(m->getVersion());
434 struct tm * timeinfo;
436 timeinfo = localtime ( &rawtime );
437 string dateString = asctime (timeinfo);
438 int pos = dateString.find('\n');
439 if (pos != string::npos) { dateString = dateString.substr(0, pos);}
443 out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
444 out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
447 vector<string> metadata = getMetaData(lookup);
448 int numBins = lookup[0]->getNumBins();
450 if (m->control_pressed) { out.close(); return 0; }
454 {"id":"GG_OTU_1", "metadata":null},
455 {"id":"GG_OTU_2", "metadata":null},
456 {"id":"GG_OTU_3", "metadata":null},
457 {"id":"GG_OTU_4", "metadata":null},
458 {"id":"GG_OTU_5", "metadata":null}
460 out << spaces + "\"rows\":[\n";
461 string rowFront = spaces + spaces + "{\"id\":\"";
462 string rowBack = "\", \"metadata\":";
463 for (int i = 0; i < numBins-1; i++) {
464 if (m->control_pressed) { out.close(); return 0; }
465 out << rowFront << m->currentSharedBinLabels[i] << rowBack << metadata[i] << "},\n";
467 out << rowFront << m->currentSharedBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n";
471 {"id":"Sample1", "metadata":null},
472 {"id":"Sample2", "metadata":null},
473 {"id":"Sample3", "metadata":null},
474 {"id":"Sample4", "metadata":null},
475 {"id":"Sample5", "metadata":null},
476 {"id":"Sample6", "metadata":null}
479 string colBack = "\", \"metadata\":";
480 out << spaces + "\"columns\":[\n";
481 for (int i = 0; i < lookup.size()-1; i++) {
482 if (m->control_pressed) { out.close(); return 0; }
483 out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n";
485 out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n";
487 out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
488 out << spaces + "\"shape\": [" << numBins << "," << lookup.size() << "],\n";
489 out << spaces + "\"data\": [";
491 vector<string> dataRows;
492 if (format == "sparse") {
510 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
512 if (m->control_pressed) { out.close(); return 0; }
514 for (int j = 0; j < lookup.size(); j++) {
515 string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
516 //only print non zero values
517 if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
522 /* "matrix_type": "dense",
523 "matrix_element_type": "int",
525 "data": [[0,0,1,0,0,0],
531 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
533 if (m->control_pressed) { out.close(); return 0; }
535 string binInfo = "[";
536 for (int j = 0; j < lookup.size()-1; j++) {
537 binInfo += toString(lookup[j]->getAbundance(i)) + ",";
539 binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
540 dataRows.push_back(binInfo);
544 for (int i = 0; i < dataRows.size()-1; i++) {
545 out << dataRows[i] << ",\n" + spaces + spaces;
547 out << dataRows[dataRows.size()-1] << "]\n";
554 catch(exception& e) {
555 m->errorOut(e, "MakeBiomCommand", "getBiom");
559 //**********************************************************************************************************************
560 vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
562 vector<string> metadata;
564 if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) { metadata.push_back("null"); } }
567 //read constaxonomy file storing in a map, otulabel -> taxonomy
568 //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
570 m->openInputFile(contaxonomyfile, in);
573 m->getline(in); m->gobble(in);
575 string otuLabel, tax;
577 vector<string> otuLabels;
581 if (m->control_pressed) { in.close(); return metadata; }
583 in >> otuLabel >> size >> tax; m->gobble(in);
585 otuLabels.push_back(otuLabel);
590 //should the labels be Otu001 or PhyloType001
591 string firstBin = m->currentSharedBinLabels[0];
592 string binTag = "Otu";
593 if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType"; }
595 //convert list file bin labels to shared file bin labels
598 map<string, string> labelTaxMap;
599 string snumBins = toString(otuLabels.size());
600 for (int i = 0; i < otuLabels.size(); i++) {
602 if (m->control_pressed) { return metadata; }
604 //if there is a bin label use it otherwise make one
605 if (m->isContainingOnlyDigits(otuLabels[i])) {
606 string binLabel = binTag;
607 string sbinNumber = otuLabels[i];
608 if (sbinNumber.length() < snumBins.length()) {
609 int diff = snumBins.length() - sbinNumber.length();
610 for (int h = 0; h < diff; h++) { binLabel += "0"; }
612 binLabel += sbinNumber;
613 binLabel = m->getSimpleLabel(binLabel);
614 labelTaxMap[binLabel] = taxs[i];
615 }else { labelTaxMap[m->getSimpleLabel(otuLabels[i])] = taxs[i]; }
618 //merges OTUs classified to same gg otuid, sets otulabels to gg otuids, averages confidence scores of merged otus. overwritting of otulabels is fine because constaxonomy only allows for one label to be processed. If this assumption changes, could cause bug.
619 if (picrust) { getGreenGenesOTUIDs(lookup, labelTaxMap); }
621 //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
623 //traverse the binLabels forming the metadata strings and saving them
624 //make sure to sanity check
625 map<string, string>::iterator it;
626 for (int i = 0; i < lookup[0]->getNumBins(); i++) {
628 if (m->control_pressed) { return metadata; }
630 it = labelTaxMap.find(m->getSimpleLabel(m->currentSharedBinLabels[i]));
632 if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentSharedBinLabels[i] + ".\n"); m->control_pressed = true; }
634 vector<string> bootstrapValues;
635 string data = "{\"taxonomy\":[";
637 vector<string> scores;
638 vector<string> taxonomies = parseTax(it->second, scores);
640 for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
641 data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
643 //add bootstrap values if available
644 if (scores[0] != "null") {
645 data += ", \"bootstrap\":[";
647 for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
648 data += scores[scores.size()-1] + "]";
653 metadata.push_back(data);
661 catch(exception& e) {
662 m->errorOut(e, "MakeBiomCommand", "getMetadata");
667 //**********************************************************************************************************************
668 int MakeBiomCommand::getGreenGenesOTUIDs(vector<SharedRAbundVector*>& lookup, map<string, string>& labelTaxMap){
671 PhyloTree phylo(referenceTax);
674 map<string, string> otuMap = readGGOtuMap(); //maps reference ID -> OTU ID
676 if (m->control_pressed) { return 0; }
678 map<string, vector<string> > ggOTUIDs;
679 //loop through otu taxonomies
680 for (map<string, string>::iterator it = labelTaxMap.begin(); it != labelTaxMap.end(); it++) { //maps label -> consensus taxonomy
681 if (m->control_pressed) { break; }
683 string OTUTaxonomy = it->second;
686 m->removeConfidences(OTUTaxonomy);
688 //remove unclassifieds to match template
689 int thisPos = OTUTaxonomy.find("unclassified;");
690 if (thisPos != string::npos) { OTUTaxonomy = OTUTaxonomy.substr(0, thisPos); }
692 //get list of reference ids that map to this taxonomy
693 vector<string> referenceIds = phylo.getSeqs(OTUTaxonomy);
695 if (m->control_pressed) { break; }
697 //look for each one in otu map to find match
698 string otuID = "not found";
699 string referenceString = "";
700 for (int i = 0; i < referenceIds.size(); i++) {
701 referenceString += referenceIds[i] + " ";
702 map<string, string>::iterator itMap = otuMap.find(referenceIds[i]);
703 if (itMap != otuMap.end()) { //found it
704 otuID = itMap->second;
705 i += referenceIds.size(); //stop looking
709 //if found, add otu to ggOTUID list
710 if (otuID != "not found") {
711 map<string, vector<string> >::iterator itGG = ggOTUIDs.find(otuID);
712 if (itGG == ggOTUIDs.end()) {
713 vector<string> temp; temp.push_back(it->first); //save mothur OTU label
714 ggOTUIDs[otuID] = temp;
715 }else { ggOTUIDs[otuID].push_back(it->first); } //add mothur OTU label to list
716 }else { m->mothurOut("[ERROR]: could not find OTUId for " + it->second + ". Its reference sequences are " + referenceString + ".\n"); m->control_pressed = true; }
721 vector<SharedRAbundVector*> newLookup;
722 for (int i = 0; i < lookup.size(); i++) {
723 SharedRAbundVector* temp = new SharedRAbundVector();
724 temp->setLabel(lookup[i]->getLabel());
725 temp->setGroup(lookup[i]->getGroup());
726 newLookup.push_back(temp);
729 map<string, int> labelIndex;
730 for (int i = 0; i < m->currentSharedBinLabels.size(); i++) { labelIndex[m->getSimpleLabel(m->currentSharedBinLabels[i])] = i; }
732 vector<string> newBinLabels;
733 map<string, string> newLabelTaxMap;
734 //loop through ggOTUID list combining mothur otus and adjusting labels
735 //ggOTUIDs = 16097 -> <OTU01, OTU10, OTU22>
737 for (map<string, vector<string> >::iterator itMap = ggOTUIDs.begin(); itMap != ggOTUIDs.end(); itMap++) {
738 if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) { delete newLookup[j]; } return 0; }
740 //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
741 //find taxonomy of this otu
742 map<string, string>::iterator it = labelTaxMap.find(m->getSimpleLabel(itMap->second[0]));
743 vector<string> scores;
744 vector<string> taxonomies = parseTax(it->second, scores);
746 //merge/set OTU abundances
747 vector<int> abunds; abunds.resize(lookup.size(), 0);
748 string mergeString = "";
749 vector<float> boots; boots.resize(scores.size(), 0);
750 bool scoresNULL = false;
751 for (int j = 0; j < itMap->second.size(); j++) { //<OTU01, OTU10, OTU22>
753 if (scores[0] != "null") {
754 //merge bootstrap scores
755 vector<string> scores;
756 vector<string> taxonomies = parseTax(it->second, scores);
757 for (int i = 0; i < boots.size(); i++) {
758 float tempScore; m->mothurConvert(scores[i], tempScore);
759 boots[i] += tempScore;
761 }else { scoresNULL = true; }
764 mergeString += (itMap->second)[j] + " ";
765 for (int i = 0; i < lookup.size(); i++) {
766 abunds[i] += lookup[i]->getAbundance(labelIndex[m->getSimpleLabel((itMap->second)[j])]);
770 if (m->debug) { m->mothurOut("[DEBUG]: merging " + mergeString + " for ggOTUid = " + itMap->first + ".\n"); }
773 //add merged otu to new lookup
774 string newTaxString = "";
776 for (int j = 0; j < boots.size(); j++) { boots[j] /= (float) itMap->second.size(); }
778 //assemble new taxomoy
779 for (int j = 0; j < boots.size(); j++) {
780 newTaxString += taxonomies[j] + "(" + toString(boots[j]) + ");";
783 //assemble new taxomoy
784 for (int j = 0; j < taxonomies.size(); j++) {
785 newTaxString += taxonomies[j] + ";";
789 //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
790 //find taxonomy of this otu
791 newLabelTaxMap[itMap->first] = newTaxString;
793 //add merged otu to new lookup
794 for (int j = 0; j < abunds.size(); j++) { newLookup[j]->push_back(abunds[j], newLookup[j]->getGroup()); }
797 newBinLabels.push_back(itMap->first);
800 for (int j = 0; j < lookup.size(); j++) { delete lookup[j]; }
803 m->currentSharedBinLabels = newBinLabels;
804 labelTaxMap = newLabelTaxMap;
806 map<string, string> variables;
807 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
808 variables["[distance]"] = lookup[0]->getLabel();
809 string outputFileName = getOutputFileName("shared",variables);
811 m->openOutputFile(outputFileName, out);
812 outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
814 lookup[0]->printHeaders(out);
816 for (int i = 0; i < lookup.size(); i++) {
817 out << lookup[i]->getLabel() << '\t' << lookup[i]->getGroup() << '\t';
818 lookup[i]->print(out);
824 catch(exception& e) {
825 m->errorOut(e, "MakeBiomCommand", "getGreenGenesOTUIDs");
830 //**********************************************************************************************************************
831 map<string, string> MakeBiomCommand::readGGOtuMap(){
833 map<string, string> otuMap;
836 m->openInputFile(picrustOtuFile, in);
838 //map referenceIDs -> otuIDs
840 //16097 671376 616121 533566 683683 4332909 4434717 772666 611808 695209
842 if (m->control_pressed) { break; }
844 string line = m->getline(in); m->gobble(in);
845 vector<string> pieces = m->splitWhiteSpace(line);
847 if (pieces.size() != 0) {
848 string otuID = pieces[1];
849 for (int i = 1; i < pieces.size(); i++) { otuMap[pieces[i]] = otuID; }
856 catch(exception& e) {
857 m->errorOut(e, "MakeBiomCommand", "readGGOtuMap");
862 //**********************************************************************************************************************
863 int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
865 sampleMetadata.clear();
866 if (metadatafile == "") { for (int i = 0; i < lookup.size(); i++) { sampleMetadata.push_back("null"); } }
869 m->openInputFile(metadatafile, in);
871 vector<string> groupNames, metadataLabels;
872 map<string, vector<string> > lines;
874 string headerLine = m->getline(in); m->gobble(in);
875 vector<string> pieces = m->splitWhiteSpace(headerLine);
877 //save names of columns you are reading
878 for (int i = 1; i < pieces.size(); i++) {
879 metadataLabels.push_back(pieces[i]);
881 int count = metadataLabels.size();
883 vector<string> groups = m->getGroups();
888 if (m->control_pressed) { in.close(); return 0; }
891 in >> group; m->gobble(in);
892 groupNames.push_back(group);
894 string line = m->getline(in); m->gobble(in);
895 vector<string> thisPieces = m->splitWhiteSpaceWithQuotes(line);
897 if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); }
898 else { if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } }
904 map<string, vector<string> >::iterator it;
905 for (int i = 0; i < lookup.size(); i++) {
907 if (m->control_pressed) { return 0; }
909 it = lines.find(lookup[i]->getGroup());
911 if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; }
913 vector<string> values = it->second;
916 for (int j = 0; j < metadataLabels.size()-1; j++) {
917 values[j] = m->removeQuotes(values[j]);
918 data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", ";
920 values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]);
921 data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}";
922 sampleMetadata.push_back(data);
930 catch(exception& e) {
931 m->errorOut(e, "MakeBiomCommand", "getSampleMetaData");
937 /**************************************************************************************************/
938 //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
939 vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
945 while (tax.find_first_of(';') != -1) {
947 if (m->control_pressed) { return taxs; }
950 taxon = tax.substr(0,tax.find_first_of(';'));
952 int pos = taxon.find_last_of('(');
955 int pos2 = taxon.find_last_of(')');
957 string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
958 if (m->isNumeric1(confidenceScore)) {
959 taxon = taxon.substr(0, pos); //rip off confidence
960 scores.push_back(confidenceScore);
961 }else{ scores.push_back("null"); }
963 }else{ scores.push_back("null"); }
965 //strip "" if they are there
966 pos = taxon.find("\"");
967 if (pos != string::npos) {
969 for (int k = 0; k < taxon.length(); k++) {
970 if (taxon[k] != '\"') { newTax += taxon[k]; }
975 //look for bootstrap value
976 taxs.push_back(taxon);
977 tax = tax.substr(tax.find_first_of(';')+1, tax.length());
982 catch(exception& e) {
983 m->errorOut(e, "MakeBiomCommand", "parseTax");
988 //**********************************************************************************************************************