]> git.donarmstrong.com Git - mothur.git/blob - makebiomcommand.cpp
added picrust and ref taxonomy parameters to make.biom
[mothur.git] / makebiomcommand.cpp
1 //
2 //  makebiomcommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/16/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "makebiomcommand.h"
10 #include "sharedrabundvector.h"
11 #include "inputdata.h"
12 #include "sharedutilities.h"
13 #include "phylotree.h"
14
15 //taken from http://biom-format.org/documentation/biom_format.html
16 /* Minimal Sparse 
17  {
18  "id":null,
19  "format": "Biological Observation Matrix 0.9.1",
20  "format_url": "http://biom-format.org",
21  "type": "OTU table",
22  "generated_by": "QIIME revision 1.4.0-dev",
23  "date": "2011-12-19T19:00:00",
24  "rows":[
25  {"id":"GG_OTU_1", "metadata":null},
26  {"id":"GG_OTU_2", "metadata":null},
27  {"id":"GG_OTU_3", "metadata":null},
28  {"id":"GG_OTU_4", "metadata":null},
29  {"id":"GG_OTU_5", "metadata":null}
30  ],
31  "columns": [
32  {"id":"Sample1", "metadata":null},
33  {"id":"Sample2", "metadata":null},
34  {"id":"Sample3", "metadata":null},
35  {"id":"Sample4", "metadata":null},
36  {"id":"Sample5", "metadata":null},
37  {"id":"Sample6", "metadata":null}
38  ],
39  "matrix_type": "sparse",
40  "matrix_element_type": "int",
41  "shape": [5, 6],
42  "data":[[0,2,1],
43  [1,0,5],
44  [1,1,1],
45  [1,3,2],
46  [1,4,3],
47  [1,5,1],
48  [2,2,1],
49  [2,3,4],
50  [2,4,2],
51  [3,0,2],
52  [3,1,1],
53  [3,2,1],
54  [3,5,1],
55  [4,1,1],
56  [4,2,1]
57  ]
58  }
59  */
60 /* Minimal dense
61  {
62  "id":null,
63  "format": "Biological Observation Matrix 0.9.1",
64  "format_url": "http://biom-format.org",
65  "type": "OTU table",
66  "generated_by": "QIIME revision 1.4.0-dev",
67  "date": "2011-12-19T19:00:00",
68  "rows":[
69  {"id":"GG_OTU_1", "metadata":null},
70  {"id":"GG_OTU_2", "metadata":null},
71  {"id":"GG_OTU_3", "metadata":null},
72  {"id":"GG_OTU_4", "metadata":null},
73  {"id":"GG_OTU_5", "metadata":null}
74  ],
75  "columns": [
76  {"id":"Sample1", "metadata":null},
77  {"id":"Sample2", "metadata":null},
78  {"id":"Sample3", "metadata":null},
79  {"id":"Sample4", "metadata":null},
80  {"id":"Sample5", "metadata":null},
81  {"id":"Sample6", "metadata":null}
82  ],
83  "matrix_type": "dense",
84  "matrix_element_type": "int",
85  "shape": [5,6],
86  "data":  [[0,0,1,0,0,0],
87  [5,1,0,2,3,1],
88  [0,0,1,4,2,0],
89  [2,1,1,0,0,1],
90  [0,1,1,0,0,0]]
91  }
92  */
93 //**********************************************************************************************************************
94 vector<string> MakeBiomCommand::setParameters(){        
95         try {
96                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","biom",false,true,true); parameters.push_back(pshared);
97         CommandParameter pcontaxonomy("constaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pcontaxonomy);
98         CommandParameter preference("reftaxonomy", "InputTypes", "", "", "none", "none", "refPi","",false,false); parameters.push_back(preference);
99         CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pmetadata);
100                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
101                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
102         CommandParameter ppicrust("picrust", "InputTypes", "", "", "none", "none", "refPi","shared",false,false); parameters.push_back(ppicrust);
103         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
104                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
105         CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "","",false,false); parameters.push_back(pmatrixtype);
106
107                 vector<string> myArray;
108                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
109                 return myArray;
110         }
111         catch(exception& e) {
112                 m->errorOut(e, "MakeBiomCommand", "setParameters");
113                 exit(1);
114         }
115 }
116 //**********************************************************************************************************************
117 string MakeBiomCommand::getHelpString(){        
118         try {
119                 string helpString = "";
120                 helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype, picrust, reftaxonomy and label.  shared is required, unless you have a valid current file.\n"; //
121                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
122                 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
123                 helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
124         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance.  ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n";
125         helpString += "The metadata parameter is used to provide experimental parameters to the columns.  Things like 'sample1 gut human_gut'. \n";
126         helpString += "The picrust parameter is used to provide the greengenes OTU IDs map table.  NOTE: Picrust requires a greengenes taxonomy. \n";
127         helpString += "The referencetax parameter is used with the picrust parameter.  Picrust requires the greengenes OTU IDs to be in the biom file. \n";
128                 helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
129                 helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
130                 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
131                 helpString += "The make.biom command outputs a .biom file.\n";
132                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
133                 return helpString;
134         }
135         catch(exception& e) {
136                 m->errorOut(e, "MakeBiomCommand", "getHelpString");
137                 exit(1);
138         }
139 }
140 //**********************************************************************************************************************
141 string MakeBiomCommand::getOutputPattern(string type) {
142     try {
143         string pattern = "";
144         
145         if (type == "biom") {  pattern = "[filename],[distance],biom"; }
146         else if (type == "shared") {  pattern = "[filename],[distance],biom_shared"; }
147         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
148         
149         return pattern;
150     }
151     catch(exception& e) {
152         m->errorOut(e, "MakeBiomCommand", "getOutputPattern");
153         exit(1);
154     }
155 }
156
157 //**********************************************************************************************************************
158 MakeBiomCommand::MakeBiomCommand(){     
159         try {
160                 abort = true; calledHelp = true; 
161                 setParameters();
162                 vector<string> tempOutNames;
163                 outputTypes["biom"] = tempOutNames;
164         outputTypes["shared"] = tempOutNames;
165         }
166         catch(exception& e) {
167                 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
168                 exit(1);
169         }
170 }
171 //**********************************************************************************************************************
172 MakeBiomCommand::MakeBiomCommand(string option) {
173         try {
174                 abort = false; calledHelp = false;   
175                 allLines = 1;
176         
177                 //allow user to run help
178                 if(option == "help") { help(); abort = true; calledHelp = true; }
179                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
180                 
181                 else {
182                         vector<string> myArray = setParameters();
183                         
184                         OptionParser parser(option);
185                         map<string,string> parameters = parser.getParameters();
186                         map<string,string>::iterator it;
187                         
188                         ValidParameters validParameter;
189                         
190                         //check to make sure all parameters are valid for command
191                         for (it = parameters.begin(); it != parameters.end(); it++) { 
192                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
193                         }
194
195                         //initialize outputTypes
196                         vector<string> tempOutNames;
197                         outputTypes["biom"] = tempOutNames;
198             outputTypes["shared"] = tempOutNames;
199                         
200                         //if the user changes the input directory command factory will send this info to us in the output parameter 
201                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
202                         if (inputDir == "not found"){   inputDir = "";          }
203                         else {
204                                 string path;
205                                 it = parameters.find("shared");
206                                 //user has given a template file
207                                 if(it != parameters.end()){ 
208                                         path = m->hasPath(it->second);
209                                         //if the user has not given a path then, add inputdir. else leave path alone.
210                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
211                                 }
212                 
213                 it = parameters.find("constaxonomy");
214                                 //user has given a template file
215                                 if(it != parameters.end()){ 
216                                         path = m->hasPath(it->second);
217                                         //if the user has not given a path then, add inputdir. else leave path alone.
218                                         if (path == "") {       parameters["constaxonomy"] = inputDir + it->second;             }
219                                 }
220                 
221                 it = parameters.find("reftaxonomy");
222                                 //user has given a template file
223                                 if(it != parameters.end()){
224                                         path = m->hasPath(it->second);
225                                         //if the user has not given a path then, add inputdir. else leave path alone.
226                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
227                                 }
228                 
229                 it = parameters.find("picrust");
230                                 //user has given a template file
231                                 if(it != parameters.end()){
232                                         path = m->hasPath(it->second);
233                                         //if the user has not given a path then, add inputdir. else leave path alone.
234                                         if (path == "") {       parameters["picrust"] = inputDir + it->second;          }
235                                 }
236                 
237                 it = parameters.find("metadata");
238                                 //user has given a template file
239                                 if(it != parameters.end()){ 
240                                         path = m->hasPath(it->second);
241                                         //if the user has not given a path then, add inputdir. else leave path alone.
242                                         if (path == "") {       parameters["metadata"] = inputDir + it->second;         }
243                                 }
244                         }
245             
246                         //get shared file
247                         sharedfile = validParameter.validFile(parameters, "shared", true);
248                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
249                         else if (sharedfile == "not found") { 
250                                 //if there is a current shared file, use it
251                                 sharedfile = m->getSharedFile(); 
252                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
253                                 else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
254                         }else { m->setSharedFile(sharedfile); }
255                         
256                         
257                         //if the user changes the output directory command factory will send this info to us in the output parameter 
258                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);             }
259             
260             contaxonomyfile = validParameter.validFile(parameters, "constaxonomy", true);
261                         if (contaxonomyfile == "not found") {  contaxonomyfile = "";  }
262                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
263             
264             referenceTax = validParameter.validFile(parameters, "reftaxonomy", true);
265                         if (referenceTax == "not found") {  referenceTax = "";  }
266                         else if (referenceTax == "not open") { referenceTax = ""; abort = true; }
267             
268             picrustOtuFile = validParameter.validFile(parameters, "picrust", true);
269                         if (picrustOtuFile == "not found") {  picrustOtuFile = "";  }
270                         else if (picrustOtuFile == "not open") { picrustOtuFile = ""; abort = true; }
271
272             metadatafile = validParameter.validFile(parameters, "metadata", true);
273                         if (metadatafile == "not found") {  metadatafile = "";  }
274                         else if (metadatafile == "not open") { metadatafile = ""; abort = true; }
275             
276                         //check for optional parameter and set defaults
277                         // ...at some point should added some additional type checking...
278                         label = validParameter.validFile(parameters, "label", false);                   
279                         if (label == "not found") { label = ""; }
280                         else { 
281                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
282                                 else { allLines = 1;  }
283                         }
284                         
285                         groups = validParameter.validFile(parameters, "groups", false);                 
286                         if (groups == "not found") { groups = ""; }
287                         else { 
288                                 m->splitAtDash(groups, Groups);
289                                 m->setGroups(Groups);
290                         }
291                         
292             if (picrustOtuFile != "") {
293                 picrust=true;
294                 if (contaxonomyfile == "") {  m->mothurOut("[ERROR]: the constaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true;  }
295                 if (referenceTax == "") {  m->mothurOut("[ERROR]: the reftaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true;  }
296             }else { picrust=false; }
297             
298             if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
299             
300                         format = validParameter.validFile(parameters, "matrixtype", false);                             if (format == "not found") { format = "sparse"; }
301                         
302                         if ((format != "sparse") && (format != "dense")) {
303                                 m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true; 
304                         }
305                 }
306         
307         }
308         catch(exception& e) {
309                 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
310                 exit(1);
311         }
312 }
313 //**********************************************************************************************************************
314
315 int MakeBiomCommand::execute(){
316         try {
317         
318                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
319             
320                 InputData input(sharedfile, "sharedfile");
321                 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
322                 string lastLabel = lookup[0]->getLabel();
323         
324         //if user did not specify a label, then use first one
325         if ((contaxonomyfile != "") && (labels.size() == 0)) {
326             allLines = 0;
327             labels.insert(lastLabel);
328         }
329                 
330         getSampleMetaData(lookup);
331         
332                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333                 set<string> processedLabels;
334                 set<string> userLabels = labels;
335         
336                 //as long as you are not at the end of the file or done wih the lines you want
337                 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
338                         
339                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
340             
341                         if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
342                 
343                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
344                                 getBiom(lookup);
345                                 
346                                 processedLabels.insert(lookup[0]->getLabel());
347                                 userLabels.erase(lookup[0]->getLabel());
348                         }
349                         
350                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
351                                 string saveLabel = lookup[0]->getLabel();
352                 
353                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
354                                 lookup = input.getSharedRAbundVectors(lastLabel);
355                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
356                                 
357                                 getBiom(lookup);
358                                 
359                                 processedLabels.insert(lookup[0]->getLabel());
360                                 userLabels.erase(lookup[0]->getLabel());
361                                 
362                                 //restore real lastlabel to save below
363                                 lookup[0]->setLabel(saveLabel);
364                         }
365                         
366                         lastLabel = lookup[0]->getLabel();
367             
368                         //prevent memory leak and get next set
369                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
370                         lookup = input.getSharedRAbundVectors();                                
371                 }
372                 
373         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); }  return 0; }     
374         
375                 //output error messages about any remaining user labels
376                 set<string>::iterator it;
377                 bool needToRun = false;
378                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
379                         m->mothurOut("Your file does not include the label " + *it); 
380                         if (processedLabels.count(lastLabel) != 1) {
381                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
382                                 needToRun = true;
383                         }else {
384                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
385                         }
386                 }
387         
388                 //run last label if you need to
389                 if (needToRun == true)  {
390                         for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
391                         lookup = input.getSharedRAbundVectors(lastLabel);
392                         
393                         m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
394             getBiom(lookup);
395                         
396                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
397                 }
398                 
399         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); }  return 0; }     
400                 
401         //set sabund file as new current sabundfile
402         string current = "";
403                 itTypes = outputTypes.find("biom");
404                 if (itTypes != outputTypes.end()) {
405                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
406                 }
407
408         
409                 m->mothurOutEndLine();
410                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
412                 m->mothurOutEndLine();
413                 
414                 return 0;
415         }
416         catch(exception& e) {
417                 m->errorOut(e, "MakeBiomCommand", "execute");
418                 exit(1);
419         }
420 }
421 //**********************************************************************************************************************
422 int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
423         try {
424         map<string, string> variables; 
425         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
426         variables["[distance]"] = lookup[0]->getLabel();
427         string outputFileName = getOutputFileName("biom",variables);
428                 ofstream out;
429                 m->openOutputFile(outputFileName, out);
430                 outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
431
432         string mothurString = "mothur" + toString(m->getVersion());
433         time_t rawtime;
434         struct tm * timeinfo;
435         time ( &rawtime );
436         timeinfo = localtime ( &rawtime );
437         string dateString = asctime (timeinfo);
438         int pos = dateString.find('\n');
439         if (pos != string::npos) { dateString = dateString.substr(0, pos);}
440         string spaces = "      ";
441         
442         //standard 
443         out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
444         out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
445         
446         
447         vector<string> metadata = getMetaData(lookup);
448         int numBins = lookup[0]->getNumBins();
449         
450         if (m->control_pressed) {  out.close(); return 0; }
451         
452         //get row info
453         /*"rows":[
454                 {"id":"GG_OTU_1", "metadata":null},
455                 {"id":"GG_OTU_2", "metadata":null},
456                 {"id":"GG_OTU_3", "metadata":null},
457                 {"id":"GG_OTU_4", "metadata":null},
458                 {"id":"GG_OTU_5", "metadata":null}
459                 ],*/
460         out << spaces + "\"rows\":[\n";
461         string rowFront = spaces + spaces + "{\"id\":\"";
462         string rowBack = "\", \"metadata\":";
463         for (int i = 0; i < numBins-1; i++) {
464             if (m->control_pressed) {  out.close(); return 0; }
465             out << rowFront << m->currentSharedBinLabels[i] << rowBack << metadata[i] << "},\n"; 
466         }
467         out << rowFront << m->currentSharedBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n"; 
468        
469         //get column info
470         /*"columns": [
471                     {"id":"Sample1", "metadata":null},
472                     {"id":"Sample2", "metadata":null},
473                     {"id":"Sample3", "metadata":null},
474                     {"id":"Sample4", "metadata":null},
475                     {"id":"Sample5", "metadata":null},
476                     {"id":"Sample6", "metadata":null}
477                     ],*/
478         
479         string colBack = "\", \"metadata\":";
480         out << spaces + "\"columns\":[\n";
481         for (int i = 0; i < lookup.size()-1; i++) {
482             if (m->control_pressed) {  out.close(); return 0; }
483             out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n";
484         }
485         out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n";
486         
487         out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
488         out <<  spaces + "\"shape\": [" << numBins << "," << lookup.size() << "],\n";
489         out << spaces + "\"data\":  [";
490         
491         vector<string> dataRows;
492         if (format == "sparse") {
493             /*"data":[[0,2,1],
494              [1,0,5],
495              [1,1,1],
496              [1,3,2],
497              [1,4,3],
498              [1,5,1],
499              [2,2,1],
500              [2,3,4],
501              [2,4,2],
502              [3,0,2],
503              [3,1,1],
504              [3,2,1],
505              [3,5,1],
506              [4,1,1],
507              [4,2,1]
508              ]*/
509             string output = "";
510             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
511                 
512                 if (m->control_pressed) { out.close(); return 0; }
513                 
514                 for (int j = 0; j < lookup.size(); j++) {
515                     string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
516                     //only print non zero values
517                     if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
518                 }
519             }
520         }else {
521             
522             /* "matrix_type": "dense",
523              "matrix_element_type": "int",
524              "shape": [5,6],
525              "data":  [[0,0,1,0,0,0],
526              [5,1,0,2,3,1],
527              [0,0,1,4,2,0],
528              [2,1,1,0,0,1],
529              [0,1,1,0,0,0]]*/
530             
531             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
532                 
533                 if (m->control_pressed) { out.close(); return 0; }
534                 
535                 string binInfo = "[";
536                 for (int j = 0; j < lookup.size()-1; j++) {
537                     binInfo += toString(lookup[j]->getAbundance(i)) + ",";
538                 }
539                 binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
540                 dataRows.push_back(binInfo);
541             }
542         }
543         
544         for (int i = 0; i < dataRows.size()-1; i++) {
545             out << dataRows[i] << ",\n" + spaces  + spaces;
546         }
547         out << dataRows[dataRows.size()-1] << "]\n";
548         
549         out << "}\n";
550         out.close();
551         
552         return 0;
553     }
554         catch(exception& e) {
555                 m->errorOut(e, "MakeBiomCommand", "getBiom");
556                 exit(1);
557         }
558 }
559 //**********************************************************************************************************************
560 vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
561         try {
562         vector<string> metadata;
563         
564         if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) {  metadata.push_back("null");  } }
565         else {
566             
567             //read constaxonomy file storing in a map, otulabel -> taxonomy
568             //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
569             ifstream in;
570             m->openInputFile(contaxonomyfile, in);
571             
572             //grab headers
573             m->getline(in); m->gobble(in);
574             
575             string otuLabel, tax;
576             int size;
577             vector<string> otuLabels;
578             vector<string> taxs;
579             while (!in.eof()) {
580                 
581                 if (m->control_pressed) { in.close(); return metadata; }
582                 
583                 in >> otuLabel >> size >> tax; m->gobble(in);
584                 
585                 otuLabels.push_back(otuLabel);
586                 taxs.push_back(tax);
587             }
588             in.close();
589             
590             //should the labels be Otu001 or PhyloType001
591             string firstBin = m->currentSharedBinLabels[0];
592             string binTag = "Otu";
593             if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType";  }
594             
595             //convert list file bin labels to shared file bin labels
596             //parse tax strings
597             //save in map
598             map<string, string> labelTaxMap;
599             string snumBins = toString(otuLabels.size());
600             for (int i = 0; i < otuLabels.size(); i++) {  
601                 
602                 if (m->control_pressed) { return metadata; }
603                 
604                 //if there is a bin label use it otherwise make one
605                 if (m->isContainingOnlyDigits(otuLabels[i])) {
606                     string binLabel = binTag;
607                     string sbinNumber = otuLabels[i];
608                     if (sbinNumber.length() < snumBins.length()) { 
609                         int diff = snumBins.length() - sbinNumber.length();
610                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
611                     }
612                     binLabel += sbinNumber;
613                     binLabel = m->getSimpleLabel(binLabel);
614                     labelTaxMap[binLabel] = taxs[i];
615                 }else {  labelTaxMap[m->getSimpleLabel(otuLabels[i])] = taxs[i]; }
616             }
617             
618             //merges OTUs classified to same gg otuid, sets otulabels to gg otuids, averages confidence scores of merged otus.  overwritting of otulabels is fine because constaxonomy only allows for one label to be processed.  If this assumption changes, could cause bug.
619             if (picrust) {  getGreenGenesOTUIDs(lookup, labelTaxMap);  }
620             
621             //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
622             
623             //traverse the binLabels forming the metadata strings and saving them
624             //make sure to sanity check
625             map<string, string>::iterator it;
626             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
627                 
628                 if (m->control_pressed) { return metadata; }
629                 
630                 it = labelTaxMap.find(m->getSimpleLabel(m->currentSharedBinLabels[i]));
631                 
632                 if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentSharedBinLabels[i] + ".\n"); m->control_pressed = true; }
633                 else {
634                     vector<string> bootstrapValues;
635                     string data = "{\"taxonomy\":[";
636             
637                     vector<string> scores;
638                     vector<string> taxonomies = parseTax(it->second, scores);
639                     
640                     for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
641                     data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
642                     
643                     //add bootstrap values if available
644                     if (scores[0] != "null") {
645                         data += ", \"bootstrap\":[";
646                         
647                         for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
648                         data += scores[scores.size()-1] + "]";
649
650                     }
651                     data += "}";
652                     
653                     metadata.push_back(data);
654                 }
655             }
656         }
657         
658         return metadata;
659         
660     }
661         catch(exception& e) {
662                 m->errorOut(e, "MakeBiomCommand", "getMetadata");
663                 exit(1);
664         }
665
666 }
667 //**********************************************************************************************************************
668 int MakeBiomCommand::getGreenGenesOTUIDs(vector<SharedRAbundVector*>& lookup, map<string, string>& labelTaxMap){
669         try {
670         //read reftaxonomy
671         PhyloTree phylo(referenceTax);
672         
673         //read otu map file
674         map<string, string> otuMap = readGGOtuMap(); //maps reference ID -> OTU ID
675         
676         if (m->control_pressed) { return 0; }
677         
678         map<string, vector<string> > ggOTUIDs;
679         //loop through otu taxonomies
680         for (map<string, string>::iterator it = labelTaxMap.begin(); it != labelTaxMap.end(); it++) { //maps label -> consensus taxonomy
681             if (m->control_pressed) { break; }
682             
683             //get list of reference ids that map to this taxonomy
684             vector<string> referenceIds = phylo.getSeqs(it->second);
685             
686             if (m->control_pressed) { break; }
687             
688             //look for each one in otu map to find match
689             string otuID = "not found";
690             string referenceString = "";
691             for (int i = 0; i < referenceIds.size(); i++) {
692                 referenceString += referenceIds[i] + " ";
693                 map<string, string>::iterator itMap = otuMap.find(referenceIds[i]);
694                 if (itMap != otuMap.end()) { //found it
695                     otuID = itMap->second;
696                     i += referenceIds.size(); //stop looking
697                 }
698             }
699             
700             //if found, add otu to ggOTUID list
701             if (otuID != "not found") {
702                 map<string, vector<string> >::iterator itGG = ggOTUIDs.find(otuID);
703                 if (itGG == ggOTUIDs.end()) {
704                     vector<string> temp; temp.push_back(it->first); //save mothur OTU label
705                     ggOTUIDs[otuID] = temp;
706                 }else { ggOTUIDs[otuID].push_back(it->first); } //add mothur OTU label to list
707             }else {  m->mothurOut("[ERROR]: could not find OTUId for " + it->second + ". Its reference sequences are " + referenceString + ".\n"); m->control_pressed = true; }
708             
709         }
710         
711        
712         vector<SharedRAbundVector*> newLookup;
713                 for (int i = 0; i < lookup.size(); i++) {
714                         SharedRAbundVector* temp = new SharedRAbundVector();
715                         temp->setLabel(lookup[i]->getLabel());
716                         temp->setGroup(lookup[i]->getGroup());
717                         newLookup.push_back(temp);
718                 }
719                 
720         map<string, int> labelIndex;
721                 for (int i = 0; i < m->currentSharedBinLabels.size(); i++) {  labelIndex[m->getSimpleLabel(m->currentSharedBinLabels[i])] = i; }
722         
723         vector<string> newBinLabels;
724         map<string, string> newLabelTaxMap;
725         //loop through ggOTUID list combining mothur otus and adjusting labels
726         //ggOTUIDs = 16097 -> <OTU01, OTU10, OTU22>
727         for (map<string, vector<string> >::iterator itMap = ggOTUIDs.begin(); itMap != ggOTUIDs.end(); itMap++) {
728                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
729             
730             //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
731             //find taxonomy of this otu
732             map<string, string>::iterator it = labelTaxMap.find(m->getSimpleLabel(itMap->second[0]));
733             vector<string> scores;
734             vector<string> taxonomies = parseTax(it->second, scores);
735             
736             //merge/set OTU abundances
737             vector<int> abunds; abunds.resize(lookup.size(), 0);
738             string mergeString = "";
739             vector<float> boots; boots.resize(scores.size(), 0);
740             for (int j = 0; j < itMap->second.size(); j++) { //<OTU01, OTU10, OTU22>
741                 //merge bootstrap scores
742                 vector<string> scores;
743                 vector<string> taxonomies = parseTax(it->second, scores);
744                 for (int i = 0; i < boots.size(); i++) {
745                     float tempScore; m->mothurConvert(scores[i], tempScore);
746                     boots[i] += tempScore;
747                 }
748                 
749                 //merge abunds
750                 mergeString += (itMap->second)[j] + " ";
751                 for (int i = 0; i < lookup.size(); i++) {
752                     abunds[i] += lookup[i]->getAbundance(labelIndex[m->getSimpleLabel((itMap->second)[j])]);
753                 }
754             }
755             
756             if (m->debug) { m->mothurOut("[DEBUG]: merging " + mergeString + " for ggOTUid = " + itMap->first + ".\n");  }
757             
758             //average scores
759             //add merged otu to new lookup
760             for (int j = 0; j < boots.size(); j++) { boots[j] /= (float) itMap->second.size(); }
761             
762             //assemble new taxomoy
763             string newTaxString = "";
764             for (int j = 0; j < boots.size(); j++) {
765                 newTaxString += taxonomies[j] + "(" + toString(boots[j]) + ");";
766             }
767
768             //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
769             //find taxonomy of this otu
770             newLabelTaxMap[itMap->first] = newTaxString;
771             
772             //add merged otu to new lookup
773             for (int j = 0; j < abunds.size(); j++) { newLookup[j]->push_back(abunds[j], newLookup[j]->getGroup()); }
774             
775             //saved otu label
776             newBinLabels.push_back(itMap->first);
777         }
778                 
779                 for (int j = 0; j < lookup.size(); j++) {  delete lookup[j];  }
780                 
781                 lookup = newLookup;
782                 m->currentSharedBinLabels = newBinLabels;
783         labelTaxMap = newLabelTaxMap;
784         
785         map<string, string> variables;
786         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
787         variables["[distance]"] = lookup[0]->getLabel();
788         string outputFileName = getOutputFileName("shared",variables);
789                 ofstream out;
790                 m->openOutputFile(outputFileName, out);
791                 outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
792         
793         lookup[0]->printHeaders(out);
794                 
795                 for (int i = 0; i < lookup.size(); i++) {
796                         out << lookup[i]->getLabel() << '\t' << lookup[i]->getGroup() << '\t';
797                         lookup[i]->print(out);
798                 }
799                 out.close();
800
801         return 0;
802     }
803         catch(exception& e) {
804                 m->errorOut(e, "MakeBiomCommand", "getGreenGenesOTUIDs");
805                 exit(1);
806         }
807     
808 }
809 //**********************************************************************************************************************
810 map<string, string> MakeBiomCommand::readGGOtuMap(){
811         try {
812         map<string, string> otuMap;
813         
814         ifstream in;
815         m->openInputFile(picrustOtuFile, in);
816         
817         //map referenceIDs -> otuIDs
818         //lines look like:
819         //16097 671376  616121  533566  683683  4332909 4434717 772666  611808  695209
820         while(!in.eof()) {
821             if (m->control_pressed) { break; }
822             
823             string line = m->getline(in); m->gobble(in);
824             vector<string> pieces = m->splitWhiteSpace(line);
825             
826             if (pieces.size() != 0) {
827                 string otuID = pieces[0];
828                 for (int i = 1; i < pieces.size(); i++) {  otuMap[pieces[i]] = otuID; }
829             }
830         }
831         in.close();
832         
833         return otuMap;
834     }
835         catch(exception& e) {
836                 m->errorOut(e, "MakeBiomCommand", "readGGOtuMap");
837                 exit(1);
838         }
839     
840 }
841 //**********************************************************************************************************************
842 int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
843         try {
844         sampleMetadata.clear();
845         if (metadatafile == "") {  for (int i = 0; i < lookup.size(); i++) {  sampleMetadata.push_back("null");  } }
846         else {
847             ifstream in;
848             m->openInputFile(metadatafile, in);
849             
850             vector<string> groupNames, metadataLabels;
851             map<string, vector<string> > lines;
852             
853             string headerLine = m->getline(in); m->gobble(in);
854             vector<string> pieces = m->splitWhiteSpace(headerLine);
855             
856             //save names of columns you are reading
857             for (int i = 1; i < pieces.size(); i++) {
858                 metadataLabels.push_back(pieces[i]);
859             }
860             int count = metadataLabels.size();
861                         
862             vector<string> groups = m->getGroups();
863             
864             //read rest of file
865             while (!in.eof()) {
866                 
867                 if (m->control_pressed) { in.close(); return 0; }
868                 
869                 string group = "";
870                 in >> group; m->gobble(in);
871                 groupNames.push_back(group);
872                 
873                 string line = m->getline(in); m->gobble(in);
874                 vector<string> thisPieces = m->splitWhiteSpaceWithQuotes(line);
875         
876                 if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); }
877                 else {  if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } }
878                 
879                 m->gobble(in);
880             }
881             in.close();
882             
883             map<string, vector<string> >::iterator it;
884             for (int i = 0; i < lookup.size(); i++) {
885                 
886                 if (m->control_pressed) { return 0; }
887                 
888                 it = lines.find(lookup[i]->getGroup());
889                 
890                 if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; }
891                 else {
892                     vector<string> values = it->second;
893                     
894                     string data = "{";
895                     for (int j = 0; j < metadataLabels.size()-1; j++) { 
896                         values[j] = m->removeQuotes(values[j]); 
897                         data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", "; 
898                     }
899                     values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]);
900                     data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}";
901                     sampleMetadata.push_back(data);
902                 }
903             }
904         }
905         
906         return 0;
907         
908     }
909         catch(exception& e) {
910                 m->errorOut(e, "MakeBiomCommand", "getSampleMetaData");
911                 exit(1);
912         }
913     
914 }
915
916 /**************************************************************************************************/
917 //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
918 vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
919         try {
920                 
921                 string taxon;
922         vector<string> taxs;
923                 
924                 while (tax.find_first_of(';') != -1) {
925                         
926                         if (m->control_pressed) { return taxs; }
927                         
928                         //get taxon
929                         taxon = tax.substr(0,tax.find_first_of(';'));
930             
931                         int pos = taxon.find_last_of('(');
932                         if (pos != -1) {
933                                 //is it a number?
934                                 int pos2 = taxon.find_last_of(')');
935                                 if (pos2 != -1) {
936                                         string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
937                                         if (m->isNumeric1(confidenceScore)) {
938                                                 taxon = taxon.substr(0, pos); //rip off confidence 
939                         scores.push_back(confidenceScore);
940                                         }else{ scores.push_back("null"); }
941                                 }
942                         }else{ scores.push_back("null"); }
943                         
944             //strip "" if they are there
945             pos = taxon.find("\"");
946             if (pos != string::npos) {
947                 string newTax = "";
948                 for (int k = 0; k < taxon.length(); k++) {
949                     if (taxon[k] != '\"') { newTax += taxon[k]; }
950                 }
951                 taxon = newTax;
952             }
953             
954             //look for bootstrap value
955                         taxs.push_back(taxon);
956             tax = tax.substr(tax.find_first_of(';')+1, tax.length());
957                 }
958                 
959                 return taxs;
960         }
961         catch(exception& e) {
962                 m->errorOut(e, "MakeBiomCommand", "parseTax");
963                 exit(1);
964         }
965 }
966
967 //**********************************************************************************************************************
968
969
970