]> git.donarmstrong.com Git - mothur.git/blob - makebiomcommand.cpp
added oligos class. added check orient parameter to trim.flows, sffinfo, fastq.info...
[mothur.git] / makebiomcommand.cpp
1 //
2 //  makebiomcommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 4/16/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "makebiomcommand.h"
10 #include "sharedrabundvector.h"
11 #include "inputdata.h"
12 #include "sharedutilities.h"
13 #include "phylotree.h"
14
15 //taken from http://biom-format.org/documentation/biom_format.html
16 /* Minimal Sparse 
17  {
18  "id":null,
19  "format": "Biological Observation Matrix 0.9.1",
20  "format_url": "http://biom-format.org",
21  "type": "OTU table",
22  "generated_by": "QIIME revision 1.4.0-dev",
23  "date": "2011-12-19T19:00:00",
24  "rows":[
25  {"id":"GG_OTU_1", "metadata":null},
26  {"id":"GG_OTU_2", "metadata":null},
27  {"id":"GG_OTU_3", "metadata":null},
28  {"id":"GG_OTU_4", "metadata":null},
29  {"id":"GG_OTU_5", "metadata":null}
30  ],
31  "columns": [
32  {"id":"Sample1", "metadata":null},
33  {"id":"Sample2", "metadata":null},
34  {"id":"Sample3", "metadata":null},
35  {"id":"Sample4", "metadata":null},
36  {"id":"Sample5", "metadata":null},
37  {"id":"Sample6", "metadata":null}
38  ],
39  "matrix_type": "sparse",
40  "matrix_element_type": "int",
41  "shape": [5, 6],
42  "data":[[0,2,1],
43  [1,0,5],
44  [1,1,1],
45  [1,3,2],
46  [1,4,3],
47  [1,5,1],
48  [2,2,1],
49  [2,3,4],
50  [2,4,2],
51  [3,0,2],
52  [3,1,1],
53  [3,2,1],
54  [3,5,1],
55  [4,1,1],
56  [4,2,1]
57  ]
58  }
59  */
60 /* Minimal dense
61  {
62  "id":null,
63  "format": "Biological Observation Matrix 0.9.1",
64  "format_url": "http://biom-format.org",
65  "type": "OTU table",
66  "generated_by": "QIIME revision 1.4.0-dev",
67  "date": "2011-12-19T19:00:00",
68  "rows":[
69  {"id":"GG_OTU_1", "metadata":null},
70  {"id":"GG_OTU_2", "metadata":null},
71  {"id":"GG_OTU_3", "metadata":null},
72  {"id":"GG_OTU_4", "metadata":null},
73  {"id":"GG_OTU_5", "metadata":null}
74  ],
75  "columns": [
76  {"id":"Sample1", "metadata":null},
77  {"id":"Sample2", "metadata":null},
78  {"id":"Sample3", "metadata":null},
79  {"id":"Sample4", "metadata":null},
80  {"id":"Sample5", "metadata":null},
81  {"id":"Sample6", "metadata":null}
82  ],
83  "matrix_type": "dense",
84  "matrix_element_type": "int",
85  "shape": [5,6],
86  "data":  [[0,0,1,0,0,0],
87  [5,1,0,2,3,1],
88  [0,0,1,4,2,0],
89  [2,1,1,0,0,1],
90  [0,1,1,0,0,0]]
91  }
92  */
93 //**********************************************************************************************************************
94 vector<string> MakeBiomCommand::setParameters(){        
95         try {
96                 CommandParameter pshared("shared", "InputTypes", "", "", "none", "none", "none","biom",false,true,true); parameters.push_back(pshared);
97         CommandParameter pcontaxonomy("constaxonomy", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pcontaxonomy);
98         CommandParameter preference("reftaxonomy", "InputTypes", "", "", "none", "none", "refPi","",false,false); parameters.push_back(preference);
99         CommandParameter pmetadata("metadata", "InputTypes", "", "", "none", "none", "none","",false,false); parameters.push_back(pmetadata);
100                 CommandParameter pgroups("groups", "String", "", "", "", "", "","",false,false); parameters.push_back(pgroups);
101                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
102         CommandParameter ppicrust("picrust", "InputTypes", "", "", "none", "none", "refPi","shared",false,false); parameters.push_back(ppicrust);
103         CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
104                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
105         CommandParameter pmatrixtype("matrixtype", "Multiple", "sparse-dense", "sparse", "", "", "","",false,false); parameters.push_back(pmatrixtype);
106
107                 vector<string> myArray;
108                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
109                 return myArray;
110         }
111         catch(exception& e) {
112                 m->errorOut(e, "MakeBiomCommand", "setParameters");
113                 exit(1);
114         }
115 }
116 //**********************************************************************************************************************
117 string MakeBiomCommand::getHelpString(){        
118         try {
119                 string helpString = "";
120                 helpString += "The make.biom command parameters are shared, contaxonomy, metadata, groups, matrixtype, picrust, reftaxonomy and label.  shared is required, unless you have a valid current file.\n"; //
121                 helpString += "The groups parameter allows you to specify which of the groups in your groupfile you would like included. The group names are separated by dashes.\n";
122                 helpString += "The label parameter allows you to select what distance levels you would like, and are also separated by dashes.\n";
123                 helpString += "The matrixtype parameter allows you to select what type you would like to make. Choices are sparse and dense, default is sparse.\n";
124         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile). Be SURE that the you are the constaxonomy file distance matches the shared file distance.  ie, for *.0.03.cons.taxonomy set label=0.03. Mothur is smart enough to handle shared files that have been subsampled. It is used to assign taxonomy information to the metadata of rows.\n";
125         helpString += "The metadata parameter is used to provide experimental parameters to the columns.  Things like 'sample1 gut human_gut'. \n";
126         helpString += "The picrust parameter is used to provide the greengenes OTU IDs map table.  NOTE: Picrust requires a greengenes taxonomy. \n";
127         helpString += "The referencetax parameter is used with the picrust parameter.  Picrust requires the greengenes OTU IDs to be in the biom file. \n";
128                 helpString += "The make.biom command should be in the following format: make.biom(shared=yourShared, groups=yourGroups, label=yourLabels).\n";
129                 helpString += "Example make.biom(shared=abrecovery.an.shared, groups=A-B-C).\n";
130                 helpString += "The default value for groups is all the groups in your groupfile, and all labels in your inputfile will be used.\n";
131                 helpString += "The make.biom command outputs a .biom file.\n";
132                 helpString += "Note: No spaces between parameter labels (i.e. groups), '=' and parameters (i.e.yourGroups).\n";
133                 return helpString;
134         }
135         catch(exception& e) {
136                 m->errorOut(e, "MakeBiomCommand", "getHelpString");
137                 exit(1);
138         }
139 }
140 //**********************************************************************************************************************
141 string MakeBiomCommand::getOutputPattern(string type) {
142     try {
143         string pattern = "";
144         
145         if (type == "biom") {  pattern = "[filename],[distance],biom"; }
146         else if (type == "shared") {  pattern = "[filename],[distance],biom_shared"; }
147         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
148         
149         return pattern;
150     }
151     catch(exception& e) {
152         m->errorOut(e, "MakeBiomCommand", "getOutputPattern");
153         exit(1);
154     }
155 }
156
157 //**********************************************************************************************************************
158 MakeBiomCommand::MakeBiomCommand(){     
159         try {
160                 abort = true; calledHelp = true; 
161                 setParameters();
162                 vector<string> tempOutNames;
163                 outputTypes["biom"] = tempOutNames;
164         outputTypes["shared"] = tempOutNames;
165         }
166         catch(exception& e) {
167                 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
168                 exit(1);
169         }
170 }
171 //**********************************************************************************************************************
172 MakeBiomCommand::MakeBiomCommand(string option) {
173         try {
174                 abort = false; calledHelp = false;   
175                 allLines = 1;
176         
177                 //allow user to run help
178                 if(option == "help") { help(); abort = true; calledHelp = true; }
179                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
180                 
181                 else {
182                         vector<string> myArray = setParameters();
183                         
184                         OptionParser parser(option);
185                         map<string,string> parameters = parser.getParameters();
186                         map<string,string>::iterator it;
187                         
188                         ValidParameters validParameter;
189                         
190                         //check to make sure all parameters are valid for command
191                         for (it = parameters.begin(); it != parameters.end(); it++) { 
192                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
193                         }
194
195                         //initialize outputTypes
196                         vector<string> tempOutNames;
197                         outputTypes["biom"] = tempOutNames;
198             outputTypes["shared"] = tempOutNames;
199                         
200                         //if the user changes the input directory command factory will send this info to us in the output parameter 
201                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
202                         if (inputDir == "not found"){   inputDir = "";          }
203                         else {
204                                 string path;
205                                 it = parameters.find("shared");
206                                 //user has given a template file
207                                 if(it != parameters.end()){ 
208                                         path = m->hasPath(it->second);
209                                         //if the user has not given a path then, add inputdir. else leave path alone.
210                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
211                                 }
212                 
213                 it = parameters.find("constaxonomy");
214                                 //user has given a template file
215                                 if(it != parameters.end()){ 
216                                         path = m->hasPath(it->second);
217                                         //if the user has not given a path then, add inputdir. else leave path alone.
218                                         if (path == "") {       parameters["constaxonomy"] = inputDir + it->second;             }
219                                 }
220                 
221                 it = parameters.find("reftaxonomy");
222                                 //user has given a template file
223                                 if(it != parameters.end()){
224                                         path = m->hasPath(it->second);
225                                         //if the user has not given a path then, add inputdir. else leave path alone.
226                                         if (path == "") {       parameters["reftaxonomy"] = inputDir + it->second;              }
227                                 }
228                 
229                 it = parameters.find("picrust");
230                                 //user has given a template file
231                                 if(it != parameters.end()){
232                                         path = m->hasPath(it->second);
233                                         //if the user has not given a path then, add inputdir. else leave path alone.
234                                         if (path == "") {       parameters["picrust"] = inputDir + it->second;          }
235                                 }
236                 
237                 it = parameters.find("metadata");
238                                 //user has given a template file
239                                 if(it != parameters.end()){ 
240                                         path = m->hasPath(it->second);
241                                         //if the user has not given a path then, add inputdir. else leave path alone.
242                                         if (path == "") {       parameters["metadata"] = inputDir + it->second;         }
243                                 }
244                         }
245             
246                         //get shared file
247                         sharedfile = validParameter.validFile(parameters, "shared", true);
248                         if (sharedfile == "not open") { sharedfile = ""; abort = true; }        
249                         else if (sharedfile == "not found") { 
250                                 //if there is a current shared file, use it
251                                 sharedfile = m->getSharedFile(); 
252                                 if (sharedfile != "") { m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
253                                 else {  m->mothurOut("You have no current sharedfile and the shared parameter is required."); m->mothurOutEndLine(); abort = true; }
254                         }else { m->setSharedFile(sharedfile); }
255                         
256                         
257                         //if the user changes the output directory command factory will send this info to us in the output parameter 
258                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = m->hasPath(sharedfile);             }
259             
260             contaxonomyfile = validParameter.validFile(parameters, "constaxonomy", true);
261                         if (contaxonomyfile == "not found") {  contaxonomyfile = "";  }
262                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
263             
264             referenceTax = validParameter.validFile(parameters, "reftaxonomy", true);
265                         if (referenceTax == "not found") {  referenceTax = "";  }
266                         else if (referenceTax == "not open") { referenceTax = ""; abort = true; }
267             
268             picrustOtuFile = validParameter.validFile(parameters, "picrust", true);
269                         if (picrustOtuFile == "not found") {  picrustOtuFile = "";  }
270                         else if (picrustOtuFile == "not open") { picrustOtuFile = ""; abort = true; }
271
272             metadatafile = validParameter.validFile(parameters, "metadata", true);
273                         if (metadatafile == "not found") {  metadatafile = "";  }
274                         else if (metadatafile == "not open") { metadatafile = ""; abort = true; }
275             
276                         //check for optional parameter and set defaults
277                         // ...at some point should added some additional type checking...
278                         label = validParameter.validFile(parameters, "label", false);                   
279                         if (label == "not found") { label = ""; }
280                         else { 
281                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
282                                 else { allLines = 1;  }
283                         }
284                         
285                         groups = validParameter.validFile(parameters, "groups", false);                 
286                         if (groups == "not found") { groups = ""; }
287                         else { 
288                                 m->splitAtDash(groups, Groups);
289                                 m->setGroups(Groups);
290                         }
291                         
292             if (picrustOtuFile != "") {
293                 picrust=true;
294                 if (contaxonomyfile == "") {  m->mothurOut("[ERROR]: the constaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true;  }
295                 if (referenceTax == "") {  m->mothurOut("[ERROR]: the reftaxonomy parameter is required with the picrust parameter, aborting."); m->mothurOutEndLine(); abort = true;  }
296             }else { picrust=false; }
297             
298             if ((contaxonomyfile != "") && (labels.size() > 1)) { m->mothurOut("[ERROR]: the contaxonomy parameter cannot be used with multiple labels."); m->mothurOutEndLine(); abort = true; }
299             
300                         format = validParameter.validFile(parameters, "matrixtype", false);                             if (format == "not found") { format = "sparse"; }
301                         
302                         if ((format != "sparse") && (format != "dense")) {
303                                 m->mothurOut(format + " is not a valid option for the matrixtype parameter. Options are sparse and dense."); m->mothurOutEndLine(); abort = true; 
304                         }
305                 }
306         
307         }
308         catch(exception& e) {
309                 m->errorOut(e, "MakeBiomCommand", "MakeBiomCommand");
310                 exit(1);
311         }
312 }
313 //**********************************************************************************************************************
314
315 int MakeBiomCommand::execute(){
316         try {
317         
318                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
319             
320                 InputData input(sharedfile, "sharedfile");
321                 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
322                 string lastLabel = lookup[0]->getLabel();
323         
324         //if user did not specify a label, then use first one
325         if ((contaxonomyfile != "") && (labels.size() == 0)) {
326             allLines = 0;
327             labels.insert(lastLabel);
328         }
329                 
330         getSampleMetaData(lookup);
331         
332                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
333                 set<string> processedLabels;
334                 set<string> userLabels = labels;
335         
336                 //as long as you are not at the end of the file or done wih the lines you want
337                 while((lookup[0] != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
338                         
339                         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); } for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  return 0; }
340             
341                         if(allLines == 1 || labels.count(lookup[0]->getLabel()) == 1){                  
342                 
343                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
344                                 getBiom(lookup);
345                                 
346                                 processedLabels.insert(lookup[0]->getLabel());
347                                 userLabels.erase(lookup[0]->getLabel());
348                         }
349                         
350                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
351                                 string saveLabel = lookup[0]->getLabel();
352                 
353                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }  
354                                 lookup = input.getSharedRAbundVectors(lastLabel);
355                                 m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
356                                 
357                                 getBiom(lookup);
358                                 
359                                 processedLabels.insert(lookup[0]->getLabel());
360                                 userLabels.erase(lookup[0]->getLabel());
361                                 
362                                 //restore real lastlabel to save below
363                                 lookup[0]->setLabel(saveLabel);
364                         }
365                         
366                         lastLabel = lookup[0]->getLabel();
367             
368                         //prevent memory leak and get next set
369                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i]; lookup[i] = NULL; }
370                         lookup = input.getSharedRAbundVectors();                                
371                 }
372                 
373         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); }  return 0; }     
374         
375                 //output error messages about any remaining user labels
376                 set<string>::iterator it;
377                 bool needToRun = false;
378                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
379                         m->mothurOut("Your file does not include the label " + *it); 
380                         if (processedLabels.count(lastLabel) != 1) {
381                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
382                                 needToRun = true;
383                         }else {
384                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
385                         }
386                 }
387         
388                 //run last label if you need to
389                 if (needToRun == true)  {
390                         for (int i = 0; i < lookup.size(); i++) { if (lookup[i] != NULL) { delete lookup[i]; } }  
391                         lookup = input.getSharedRAbundVectors(lastLabel);
392                         
393                         m->mothurOut(lookup[0]->getLabel()); m->mothurOutEndLine();
394             getBiom(lookup);
395                         
396                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
397                 }
398                 
399         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]); }  return 0; }     
400                 
401         //set sabund file as new current sabundfile
402         string current = "";
403                 itTypes = outputTypes.find("biom");
404                 if (itTypes != outputTypes.end()) {
405                         if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setBiomFile(current); }
406                 }
407
408         
409                 m->mothurOutEndLine();
410                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
411                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
412                 m->mothurOutEndLine();
413                 
414                 return 0;
415         }
416         catch(exception& e) {
417                 m->errorOut(e, "MakeBiomCommand", "execute");
418                 exit(1);
419         }
420 }
421 //**********************************************************************************************************************
422 int MakeBiomCommand::getBiom(vector<SharedRAbundVector*>& lookup){
423         try {
424         map<string, string> variables; 
425         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
426         variables["[distance]"] = lookup[0]->getLabel();
427         string outputFileName = getOutputFileName("biom",variables);
428                 ofstream out;
429                 m->openOutputFile(outputFileName, out);
430                 outputNames.push_back(outputFileName); outputTypes["biom"].push_back(outputFileName);
431
432         string mothurString = "mothur" + toString(m->getVersion());
433         time_t rawtime;
434         struct tm * timeinfo;
435         time ( &rawtime );
436         timeinfo = localtime ( &rawtime );
437         string dateString = asctime (timeinfo);
438         int pos = dateString.find('\n');
439         if (pos != string::npos) { dateString = dateString.substr(0, pos);}
440         string spaces = "      ";
441         
442         //standard 
443         out << "{\n" + spaces + "\"id\":\"" + sharedfile + "-" + lookup[0]->getLabel() + "\",\n" + spaces + "\"format\": \"Biological Observation Matrix 0.9.1\",\n" + spaces + "\"format_url\": \"http://biom-format.org\",\n";
444         out << spaces + "\"type\": \"OTU table\",\n" + spaces + "\"generated_by\": \"" << mothurString << "\",\n" + spaces + "\"date\": \"" << dateString << "\",\n";
445         
446         
447         vector<string> metadata = getMetaData(lookup);
448         int numBins = lookup[0]->getNumBins();
449         
450         if (m->control_pressed) {  out.close(); return 0; }
451         
452         //get row info
453         /*"rows":[
454                 {"id":"GG_OTU_1", "metadata":null},
455                 {"id":"GG_OTU_2", "metadata":null},
456                 {"id":"GG_OTU_3", "metadata":null},
457                 {"id":"GG_OTU_4", "metadata":null},
458                 {"id":"GG_OTU_5", "metadata":null}
459                 ],*/
460         out << spaces + "\"rows\":[\n";
461         string rowFront = spaces + spaces + "{\"id\":\"";
462         string rowBack = "\", \"metadata\":";
463         for (int i = 0; i < numBins-1; i++) {
464             if (m->control_pressed) {  out.close(); return 0; }
465             out << rowFront << m->currentSharedBinLabels[i] << rowBack << metadata[i] << "},\n"; 
466         }
467         out << rowFront << m->currentSharedBinLabels[(numBins-1)] << rowBack << metadata[(numBins-1)] << "}\n" + spaces + "],\n"; 
468        
469         //get column info
470         /*"columns": [
471                     {"id":"Sample1", "metadata":null},
472                     {"id":"Sample2", "metadata":null},
473                     {"id":"Sample3", "metadata":null},
474                     {"id":"Sample4", "metadata":null},
475                     {"id":"Sample5", "metadata":null},
476                     {"id":"Sample6", "metadata":null}
477                     ],*/
478         
479         string colBack = "\", \"metadata\":";
480         out << spaces + "\"columns\":[\n";
481         for (int i = 0; i < lookup.size()-1; i++) {
482             if (m->control_pressed) {  out.close(); return 0; }
483             out << rowFront << lookup[i]->getGroup() << colBack << sampleMetadata[i] << "},\n";
484         }
485         out << rowFront << lookup[(lookup.size()-1)]->getGroup() << colBack << sampleMetadata[lookup.size()-1] << "}\n" + spaces + "],\n";
486         
487         out << spaces + "\"matrix_type\": \"" << format << "\",\n" + spaces + "\"matrix_element_type\": \"int\",\n";
488         out <<  spaces + "\"shape\": [" << numBins << "," << lookup.size() << "],\n";
489         out << spaces + "\"data\":  [";
490         
491         vector<string> dataRows;
492         if (format == "sparse") {
493             /*"data":[[0,2,1],
494              [1,0,5],
495              [1,1,1],
496              [1,3,2],
497              [1,4,3],
498              [1,5,1],
499              [2,2,1],
500              [2,3,4],
501              [2,4,2],
502              [3,0,2],
503              [3,1,1],
504              [3,2,1],
505              [3,5,1],
506              [4,1,1],
507              [4,2,1]
508              ]*/
509             string output = "";
510             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
511                 
512                 if (m->control_pressed) { out.close(); return 0; }
513                 
514                 for (int j = 0; j < lookup.size(); j++) {
515                     string binInfo = "[" + toString(i) + "," + toString(j) + "," + toString(lookup[j]->getAbundance(i)) + "]";
516                     //only print non zero values
517                     if (lookup[j]->getAbundance(i) != 0) { dataRows.push_back(binInfo); }
518                 }
519             }
520         }else {
521             
522             /* "matrix_type": "dense",
523              "matrix_element_type": "int",
524              "shape": [5,6],
525              "data":  [[0,0,1,0,0,0],
526              [5,1,0,2,3,1],
527              [0,0,1,4,2,0],
528              [2,1,1,0,0,1],
529              [0,1,1,0,0,0]]*/
530             
531             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
532                 
533                 if (m->control_pressed) { out.close(); return 0; }
534                 
535                 string binInfo = "[";
536                 for (int j = 0; j < lookup.size()-1; j++) {
537                     binInfo += toString(lookup[j]->getAbundance(i)) + ",";
538                 }
539                 binInfo += toString(lookup[lookup.size()-1]->getAbundance(i)) + "]";
540                 dataRows.push_back(binInfo);
541             }
542         }
543         
544         for (int i = 0; i < dataRows.size()-1; i++) {
545             out << dataRows[i] << ",\n" + spaces  + spaces;
546         }
547         out << dataRows[dataRows.size()-1] << "]\n";
548         
549         out << "}\n";
550         out.close();
551         
552         return 0;
553     }
554         catch(exception& e) {
555                 m->errorOut(e, "MakeBiomCommand", "getBiom");
556                 exit(1);
557         }
558 }
559 //**********************************************************************************************************************
560 vector<string> MakeBiomCommand::getMetaData(vector<SharedRAbundVector*>& lookup){
561         try {
562         vector<string> metadata;
563         
564         if (contaxonomyfile == "") { for (int i = 0; i < lookup[0]->getNumBins(); i++) {  metadata.push_back("null");  } }
565         else {
566             
567             //read constaxonomy file storing in a map, otulabel -> taxonomy
568             //constaxonomy file will most likely contain more labels than the shared file, because sharedfile could have been subsampled.
569             ifstream in;
570             m->openInputFile(contaxonomyfile, in);
571             
572             //grab headers
573             m->getline(in); m->gobble(in);
574             
575             string otuLabel, tax;
576             int size;
577             vector<string> otuLabels;
578             vector<string> taxs;
579             while (!in.eof()) {
580                 
581                 if (m->control_pressed) { in.close(); return metadata; }
582                 
583                 in >> otuLabel >> size >> tax; m->gobble(in);
584                 
585                 otuLabels.push_back(otuLabel);
586                 taxs.push_back(tax);
587             }
588             in.close();
589             
590             //should the labels be Otu001 or PhyloType001
591             string firstBin = m->currentSharedBinLabels[0];
592             string binTag = "Otu";
593             if ((firstBin.find("Otu")) == string::npos) { binTag = "PhyloType";  }
594             
595             //convert list file bin labels to shared file bin labels
596             //parse tax strings
597             //save in map
598             map<string, string> labelTaxMap;
599             string snumBins = toString(otuLabels.size());
600             for (int i = 0; i < otuLabels.size(); i++) {  
601                 
602                 if (m->control_pressed) { return metadata; }
603                 
604                 //if there is a bin label use it otherwise make one
605                 if (m->isContainingOnlyDigits(otuLabels[i])) {
606                     string binLabel = binTag;
607                     string sbinNumber = otuLabels[i];
608                     if (sbinNumber.length() < snumBins.length()) { 
609                         int diff = snumBins.length() - sbinNumber.length();
610                         for (int h = 0; h < diff; h++) { binLabel += "0"; }
611                     }
612                     binLabel += sbinNumber;
613                     binLabel = m->getSimpleLabel(binLabel);
614                     labelTaxMap[binLabel] = taxs[i];
615                 }else {  labelTaxMap[m->getSimpleLabel(otuLabels[i])] = taxs[i]; }
616             }
617             
618             //merges OTUs classified to same gg otuid, sets otulabels to gg otuids, averages confidence scores of merged otus.  overwritting of otulabels is fine because constaxonomy only allows for one label to be processed.  If this assumption changes, could cause bug.
619             if (picrust) {  getGreenGenesOTUIDs(lookup, labelTaxMap);  }
620             
621             //{"taxonomy":["k__Bacteria", "p__Proteobacteria", "c__Gammaproteobacteria", "o__Enterobacteriales", "f__Enterobacteriaceae", "g__Escherichia", "s__"]}
622             
623             //traverse the binLabels forming the metadata strings and saving them
624             //make sure to sanity check
625             map<string, string>::iterator it;
626             for (int i = 0; i < lookup[0]->getNumBins(); i++) {
627                 
628                 if (m->control_pressed) { return metadata; }
629                 
630                 it = labelTaxMap.find(m->getSimpleLabel(m->currentSharedBinLabels[i]));
631                 
632                 if (it == labelTaxMap.end()) { m->mothurOut("[ERROR]: can't find taxonomy information for " + m->currentSharedBinLabels[i] + ".\n"); m->control_pressed = true; }
633                 else {
634                     vector<string> bootstrapValues;
635                     string data = "{\"taxonomy\":[";
636             
637                     vector<string> scores;
638                     vector<string> taxonomies = parseTax(it->second, scores);
639                     
640                     for (int j = 0; j < taxonomies.size()-1; j ++) { data += "\"" + taxonomies[j] + "\", "; }
641                     data += "\"" + taxonomies[taxonomies.size()-1] + "\"]";
642                     
643                     //add bootstrap values if available
644                     if (scores[0] != "null") {
645                         data += ", \"bootstrap\":[";
646                         
647                         for (int j = 0; j < scores.size()-1; j ++) { data += scores[j] + ", "; }
648                         data += scores[scores.size()-1] + "]";
649
650                     }
651                     data += "}";
652                     
653                     metadata.push_back(data);
654                 }
655             }
656         }
657         
658         return metadata;
659         
660     }
661         catch(exception& e) {
662                 m->errorOut(e, "MakeBiomCommand", "getMetadata");
663                 exit(1);
664         }
665
666 }
667 //**********************************************************************************************************************
668 int MakeBiomCommand::getGreenGenesOTUIDs(vector<SharedRAbundVector*>& lookup, map<string, string>& labelTaxMap){
669         try {
670         //read reftaxonomy
671         PhyloTree phylo(referenceTax);
672         
673         //read otu map file
674         map<string, string> otuMap = readGGOtuMap(); //maps reference ID -> OTU ID
675         
676         if (m->control_pressed) { return 0; }
677         
678         map<string, vector<string> > ggOTUIDs;
679         //loop through otu taxonomies
680         for (map<string, string>::iterator it = labelTaxMap.begin(); it != labelTaxMap.end(); it++) { //maps label -> consensus taxonomy
681             if (m->control_pressed) { break; }
682             
683             string OTUTaxonomy = it->second;
684             
685             //remove confidences
686             m->removeConfidences(OTUTaxonomy);
687             
688             //remove unclassifieds to match template
689             int thisPos = OTUTaxonomy.find("unclassified;");
690             if (thisPos != string::npos) {  OTUTaxonomy = OTUTaxonomy.substr(0, thisPos);  }
691             
692             //get list of reference ids that map to this taxonomy
693             vector<string> referenceIds = phylo.getSeqs(OTUTaxonomy);
694             
695             if (m->control_pressed) { break; }
696             
697             //look for each one in otu map to find match
698             string otuID = "not found";
699             string referenceString = "";
700             for (int i = 0; i < referenceIds.size(); i++) {
701                 referenceString += referenceIds[i] + " ";
702                 map<string, string>::iterator itMap = otuMap.find(referenceIds[i]);
703                 if (itMap != otuMap.end()) { //found it
704                     otuID = itMap->second;
705                     i += referenceIds.size(); //stop looking
706                 }
707             }
708             
709             //if found, add otu to ggOTUID list
710             if (otuID != "not found") {
711                 map<string, vector<string> >::iterator itGG = ggOTUIDs.find(otuID);
712                 if (itGG == ggOTUIDs.end()) {
713                     vector<string> temp; temp.push_back(it->first); //save mothur OTU label
714                     ggOTUIDs[otuID] = temp;
715                 }else { ggOTUIDs[otuID].push_back(it->first); } //add mothur OTU label to list
716             }else {  m->mothurOut("[ERROR]: could not find OTUId for " + it->second + ". Its reference sequences are " + referenceString + ".\n"); m->control_pressed = true; }
717             
718         }
719         
720        
721         vector<SharedRAbundVector*> newLookup;
722                 for (int i = 0; i < lookup.size(); i++) {
723                         SharedRAbundVector* temp = new SharedRAbundVector();
724                         temp->setLabel(lookup[i]->getLabel());
725                         temp->setGroup(lookup[i]->getGroup());
726                         newLookup.push_back(temp);
727                 }
728                 
729         map<string, int> labelIndex;
730                 for (int i = 0; i < m->currentSharedBinLabels.size(); i++) {  labelIndex[m->getSimpleLabel(m->currentSharedBinLabels[i])] = i; }
731         
732         vector<string> newBinLabels;
733         map<string, string> newLabelTaxMap;
734         //loop through ggOTUID list combining mothur otus and adjusting labels
735         //ggOTUIDs = 16097 -> <OTU01, OTU10, OTU22>
736         
737         for (map<string, vector<string> >::iterator itMap = ggOTUIDs.begin(); itMap != ggOTUIDs.end(); itMap++) {
738                         if (m->control_pressed) { for (int j = 0; j < newLookup.size(); j++) {  delete newLookup[j];  } return 0; }
739             
740             //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
741             //find taxonomy of this otu
742             map<string, string>::iterator it = labelTaxMap.find(m->getSimpleLabel(itMap->second[0]));
743             vector<string> scores;
744             vector<string> taxonomies = parseTax(it->second, scores);
745             
746             //merge/set OTU abundances
747             vector<int> abunds; abunds.resize(lookup.size(), 0);
748             string mergeString = "";
749             vector<float> boots; boots.resize(scores.size(), 0);
750             bool scoresNULL = false;
751             for (int j = 0; j < itMap->second.size(); j++) { //<OTU01, OTU10, OTU22>
752                 
753                 if (scores[0] != "null") {
754                     //merge bootstrap scores
755                     vector<string> scores;
756                     vector<string> taxonomies = parseTax(it->second, scores);
757                     for (int i = 0; i < boots.size(); i++) {
758                         float tempScore; m->mothurConvert(scores[i], tempScore);
759                         boots[i] += tempScore;
760                     }
761                 }else { scoresNULL = true; }
762                 
763                 //merge abunds
764                 mergeString += (itMap->second)[j] + " ";
765                 for (int i = 0; i < lookup.size(); i++) {
766                     abunds[i] += lookup[i]->getAbundance(labelIndex[m->getSimpleLabel((itMap->second)[j])]);
767                 }
768             }
769             
770             if (m->debug) { m->mothurOut("[DEBUG]: merging " + mergeString + " for ggOTUid = " + itMap->first + ".\n");  }
771             
772             //average scores
773             //add merged otu to new lookup
774             string newTaxString = "";
775             if (!scoresNULL) {
776                 for (int j = 0; j < boots.size(); j++) { boots[j] /= (float) itMap->second.size(); }
777             
778                 //assemble new taxomoy
779                 for (int j = 0; j < boots.size(); j++) {
780                     newTaxString += taxonomies[j] + "(" + toString(boots[j]) + ");";
781                 }
782             }else {
783                 //assemble new taxomoy
784                 for (int j = 0; j < taxonomies.size(); j++) {
785                     newTaxString += taxonomies[j] + ";";
786                 }
787             }
788             
789             //set new gg otu id to taxonomy. OTU01 -> k__Bacteria becomes 16097 -> k__Bacteria
790             //find taxonomy of this otu
791             newLabelTaxMap[itMap->first] = newTaxString;
792             
793             //add merged otu to new lookup
794             for (int j = 0; j < abunds.size(); j++) { newLookup[j]->push_back(abunds[j], newLookup[j]->getGroup()); }
795             
796             //saved otu label
797             newBinLabels.push_back(itMap->first);
798         }
799                 
800                 for (int j = 0; j < lookup.size(); j++) {  delete lookup[j];  }
801                 
802                 lookup = newLookup;
803                 m->currentSharedBinLabels = newBinLabels;
804         labelTaxMap = newLabelTaxMap;
805         
806         map<string, string> variables;
807         variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile));
808         variables["[distance]"] = lookup[0]->getLabel();
809         string outputFileName = getOutputFileName("shared",variables);
810                 ofstream out;
811                 m->openOutputFile(outputFileName, out);
812                 outputNames.push_back(outputFileName); outputTypes["shared"].push_back(outputFileName);
813         
814         lookup[0]->printHeaders(out);
815                 
816                 for (int i = 0; i < lookup.size(); i++) {
817                         out << lookup[i]->getLabel() << '\t' << lookup[i]->getGroup() << '\t';
818                         lookup[i]->print(out);
819                 }
820                 out.close();
821
822         return 0;
823     }
824         catch(exception& e) {
825                 m->errorOut(e, "MakeBiomCommand", "getGreenGenesOTUIDs");
826                 exit(1);
827         }
828     
829 }
830 //**********************************************************************************************************************
831 map<string, string> MakeBiomCommand::readGGOtuMap(){
832         try {
833         map<string, string> otuMap;
834         
835         ifstream in;
836         m->openInputFile(picrustOtuFile, in);
837         
838         //map referenceIDs -> otuIDs
839         //lines look like:
840         //16097 671376  616121  533566  683683  4332909 4434717 772666  611808  695209
841         while(!in.eof()) {
842             if (m->control_pressed) { break; }
843             
844             string line = m->getline(in); m->gobble(in);
845             vector<string> pieces = m->splitWhiteSpace(line);
846             
847             if (pieces.size() != 0) {
848                 string otuID = pieces[1];
849                 for (int i = 1; i < pieces.size(); i++) {  otuMap[pieces[i]] = otuID; }
850             }
851         }
852         in.close();
853         
854         return otuMap;
855     }
856         catch(exception& e) {
857                 m->errorOut(e, "MakeBiomCommand", "readGGOtuMap");
858                 exit(1);
859         }
860     
861 }
862 //**********************************************************************************************************************
863 int MakeBiomCommand::getSampleMetaData(vector<SharedRAbundVector*>& lookup){
864         try {
865         sampleMetadata.clear();
866         if (metadatafile == "") {  for (int i = 0; i < lookup.size(); i++) {  sampleMetadata.push_back("null");  } }
867         else {
868             ifstream in;
869             m->openInputFile(metadatafile, in);
870             
871             vector<string> groupNames, metadataLabels;
872             map<string, vector<string> > lines;
873             
874             string headerLine = m->getline(in); m->gobble(in);
875             vector<string> pieces = m->splitWhiteSpace(headerLine);
876             
877             //save names of columns you are reading
878             for (int i = 1; i < pieces.size(); i++) {
879                 metadataLabels.push_back(pieces[i]);
880             }
881             int count = metadataLabels.size();
882                         
883             vector<string> groups = m->getGroups();
884             
885             //read rest of file
886             while (!in.eof()) {
887                 
888                 if (m->control_pressed) { in.close(); return 0; }
889                 
890                 string group = "";
891                 in >> group; m->gobble(in);
892                 groupNames.push_back(group);
893                 
894                 string line = m->getline(in); m->gobble(in);
895                 vector<string> thisPieces = m->splitWhiteSpaceWithQuotes(line);
896         
897                 if (thisPieces.size() != count) { m->mothurOut("[ERROR]: expected " + toString(count) + " items of data for sample " + group + " read " + toString(thisPieces.size()) + ", quitting.\n"); }
898                 else {  if (m->inUsersGroups(group, groups)) { lines[group] = thisPieces; } }
899                 
900                 m->gobble(in);
901             }
902             in.close();
903             
904             map<string, vector<string> >::iterator it;
905             for (int i = 0; i < lookup.size(); i++) {
906                 
907                 if (m->control_pressed) { return 0; }
908                 
909                 it = lines.find(lookup[i]->getGroup());
910                 
911                 if (it == lines.end()) { m->mothurOut("[ERROR]: can't find metadata information for " + lookup[i]->getGroup() + ", quitting.\n"); m->control_pressed = true; }
912                 else {
913                     vector<string> values = it->second;
914                     
915                     string data = "{";
916                     for (int j = 0; j < metadataLabels.size()-1; j++) { 
917                         values[j] = m->removeQuotes(values[j]); 
918                         data += "\"" + metadataLabels[j] + "\":\"" + values[j] + "\", "; 
919                     }
920                     values[metadataLabels.size()-1] = m->removeQuotes(values[metadataLabels.size()-1]);
921                     data += "\"" + metadataLabels[metadataLabels.size()-1] + "\":\"" + values[metadataLabels.size()-1] + "\"}";
922                     sampleMetadata.push_back(data);
923                 }
924             }
925         }
926         
927         return 0;
928         
929     }
930         catch(exception& e) {
931                 m->errorOut(e, "MakeBiomCommand", "getSampleMetaData");
932                 exit(1);
933         }
934     
935 }
936
937 /**************************************************************************************************/
938 //returns {Bacteria, Bacteroidetes, ..} and scores is filled with {100, 98, ...} or {null, null, null}
939 vector<string> MakeBiomCommand::parseTax(string tax, vector<string>& scores) {
940         try {
941                 
942                 string taxon;
943         vector<string> taxs;
944                 
945                 while (tax.find_first_of(';') != -1) {
946                         
947                         if (m->control_pressed) { return taxs; }
948                         
949                         //get taxon
950                         taxon = tax.substr(0,tax.find_first_of(';'));
951             
952                         int pos = taxon.find_last_of('(');
953                         if (pos != -1) {
954                                 //is it a number?
955                                 int pos2 = taxon.find_last_of(')');
956                                 if (pos2 != -1) {
957                                         string confidenceScore = taxon.substr(pos+1, (pos2-(pos+1)));
958                                         if (m->isNumeric1(confidenceScore)) {
959                                                 taxon = taxon.substr(0, pos); //rip off confidence 
960                         scores.push_back(confidenceScore);
961                                         }else{ scores.push_back("null"); }
962                                 }
963                         }else{ scores.push_back("null"); }
964                         
965             //strip "" if they are there
966             pos = taxon.find("\"");
967             if (pos != string::npos) {
968                 string newTax = "";
969                 for (int k = 0; k < taxon.length(); k++) {
970                     if (taxon[k] != '\"') { newTax += taxon[k]; }
971                 }
972                 taxon = newTax;
973             }
974             
975             //look for bootstrap value
976                         taxs.push_back(taxon);
977             tax = tax.substr(tax.find_first_of(';')+1, tax.length());
978                 }
979                 
980                 return taxs;
981         }
982         catch(exception& e) {
983                 m->errorOut(e, "MakeBiomCommand", "parseTax");
984                 exit(1);
985         }
986 }
987
988 //**********************************************************************************************************************
989
990
991