]> git.donarmstrong.com Git - mothur.git/blob - createdatabasecommand.cpp
removing chime source files from mother project.
[mothur.git] / createdatabasecommand.cpp
1 //
2 //  createdatabasecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/28/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "createdatabasecommand.h"
10 #include "inputdata.h"
11
12 //**********************************************************************************************************************
13 vector<string> CreateDatabaseCommand::setParameters(){  
14         try {
15                 CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
16                 CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
17                 CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pcontaxonomy);
18                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
20                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
21                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "CreateDatabaseCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string CreateDatabaseCommand::getHelpString(){  
35         try {
36                 string helpString = "";
37                 helpString += "The create.database command reads a listfile, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n";
38                 helpString += "The create.database command parameters are repfasta, list, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n";
39         helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
40         helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
41         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n";
42         helpString += "The group file is optional and will just give you the abundance breakdown by group.\n";
43         helpString += "The label parameter allows you to specify a label to be used from your listfile.\n";
44         helpString += "NOTE: Make SURE the repfasta, repnames and contaxonomy are for the same label as the listfile.\n";
45         helpString += "The create.database command should be in the following format: \n";
46                 helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n";       
47                 helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, name=final.an.0.03.rep.names, list=fina.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n";
48                 helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n";       
49                 return helpString;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "CreateDatabaseCommand", "getHelpString");
53                 exit(1);
54         }
55 }
56 //**********************************************************************************************************************
57 CreateDatabaseCommand::CreateDatabaseCommand(){ 
58         try {
59                 abort = true; calledHelp = true; 
60                 setParameters();
61                 vector<string> tempOutNames;
62                 outputTypes["database"] = tempOutNames;
63         }
64         catch(exception& e) {
65                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
66                 exit(1);
67         }
68 }
69
70 //**********************************************************************************************************************
71 CreateDatabaseCommand::CreateDatabaseCommand(string option)  {
72         try{
73                 abort = false; calledHelp = false;   
74         
75                 //allow user to run help
76                 if (option == "help") { 
77                         help(); abort = true; calledHelp = true;
78                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
79                 else {
80                         vector<string> myArray = setParameters();
81                         
82                         OptionParser parser(option);
83                         map<string, string> parameters = parser.getParameters();
84                         
85                         ValidParameters validParameter;
86                         map<string, string>::iterator it;
87             
88                         //check to make sure all parameters are valid for command
89                         for (it = parameters.begin(); it != parameters.end(); it++) { 
90                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
91                         }
92                         
93                         //initialize outputTypes
94                         vector<string> tempOutNames;
95                         outputTypes["database"] = tempOutNames;
96             
97                         //if the user changes the input directory command factory will send this info to us in the output parameter 
98                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
99                         if (inputDir == "not found"){   inputDir = "";          }
100                         else {
101                                 string path;
102                                 it = parameters.find("list");
103                                 //user has given a template file
104                                 if(it != parameters.end()){ 
105                                         path = m->hasPath(it->second);
106                                         //if the user has not given a path then, add inputdir. else leave path alone.
107                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
108                                 }
109                                 
110                                 it = parameters.find("repname");
111                                 //user has given a template file
112                                 if(it != parameters.end()){ 
113                                         path = m->hasPath(it->second);
114                                         //if the user has not given a path then, add inputdir. else leave path alone.
115                                         if (path == "") {       parameters["repname"] = inputDir + it->second;          }
116                                 }
117                                 
118                                 it = parameters.find("contaxonomy");
119                                 //user has given a template file
120                                 if(it != parameters.end()){ 
121                                         path = m->hasPath(it->second);
122                                         //if the user has not given a path then, add inputdir. else leave path alone.
123                                         if (path == "") {       parameters["contaxonomy"] = inputDir + it->second;              }
124                                 }
125                                 
126                                 it = parameters.find("repfasta");
127                                 //user has given a template file
128                                 if(it != parameters.end()){ 
129                                         path = m->hasPath(it->second);
130                                         //if the user has not given a path then, add inputdir. else leave path alone.
131                                         if (path == "") {       parameters["repfasta"] = inputDir + it->second;         }
132                                 }
133                                 
134                                 it = parameters.find("group");
135                                 //user has given a template file
136                                 if(it != parameters.end()){ 
137                                         path = m->hasPath(it->second);
138                                         //if the user has not given a path then, add inputdir. else leave path alone.
139                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
140                                 }
141                         }
142             
143                         
144                         //if the user changes the output directory command factory will send this info to us in the output parameter 
145                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
146                         
147                         //check for required parameters
148                         listfile = validParameter.validFile(parameters, "list", true);
149                         if (listfile == "not found") {                          
150                                 //if there is a current list file, use it
151                                 listfile = m->getListFile(); 
152                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
153                                 else {  m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
154                         }
155                         else if (listfile == "not open") { abort = true; }      
156                         else { m->setListFile(listfile); }
157                         
158                         contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
159                         if (contaxonomyfile == "not found") {  //if there is a current list file, use it
160                contaxonomyfile = "";  m->mothurOut("The contaxonomy parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
161                         }
162                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
163
164             repfastafile = validParameter.validFile(parameters, "repfasta", true);
165                         if (repfastafile == "not found") {  //if there is a current list file, use it
166                 repfastafile = "";  m->mothurOut("The repfasta parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
167                         }
168                         else if (repfastafile == "not open") { repfastafile = ""; abort = true; }
169
170             repnamesfile = validParameter.validFile(parameters, "repname", true);
171                         if (repnamesfile == "not found") {  //if there is a current list file, use it
172                 repnamesfile = "";  m->mothurOut("The repnames parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
173                         }
174                         else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; }
175
176                         groupfile = validParameter.validFile(parameters, "group", true);
177                         if (groupfile == "not open") { groupfile = ""; abort = true; }  
178                         else if (groupfile == "not found") { groupfile = ""; }
179                         else { m->setGroupFile(groupfile); }
180                         
181                         //check for optional parameter and set defaults
182                         // ...at some point should added some additional type checking...
183             label = validParameter.validFile(parameters, "label", false);                       
184                         if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your listfile.\n");}
185         }
186         }
187         catch(exception& e) {
188                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
189                 exit(1);
190         }
191 }
192 //**********************************************************************************************************************
193 int CreateDatabaseCommand::execute(){
194         try {
195                 
196                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
197         
198         //taxonomies holds the taxonomy info for each Otu
199         //classifyOtuSizes holds the size info of each Otu to help with error checking
200         vector<string> taxonomies;
201         vector<int> classifyOtuSizes = readTax(taxonomies);
202         
203         if (m->control_pressed) { return 0; }
204         
205         vector<Sequence> seqs;
206         vector<int> repOtusSizes = readFasta(seqs);
207         
208         if (m->control_pressed) { return 0; }
209         
210         //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map.
211         map<string, string> repNames;
212         int numUniqueNamesFile = readNames(repNames);
213         
214         //are there the same number of otus in the fasta and name files
215         if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
216         
217         if (m->control_pressed) { return 0; }
218         
219         //are there the same number of OTUs in the tax and fasta file
220         if (classifyOtuSizes.size() != repOtusSizes.size()) { m->mothurOut("[ERROR]: you have " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
221
222         if (m->control_pressed) { return 0; }
223         
224         //at this point we have the same number of OTUs. Are the sizes we have found so far accurate?
225         for (int i = 0; i < classifyOtuSizes.size(); i++) {
226             if (classifyOtuSizes[i] != repOtusSizes[i]) {
227                m->mothurOut("[ERROR]: OTU size info does not match for bin " + toString(i+1) + ". The contaxonomy file indicated the OTU represented " + toString(classifyOtuSizes[i]) + " sequences, but the repfasta file had " + toString(repOtusSizes[i]) + ".  These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; 
228             }
229         }
230         
231         if (m->control_pressed) { return 0; }
232         
233         //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile
234         ListVector* list = getList();
235         
236         if (m->control_pressed) { delete list; return 0; }
237         
238         GroupMap* groupmap = NULL;
239         if (groupfile != "") {
240                         groupmap = new GroupMap(groupfile);
241                         groupmap->readMap();
242                 }
243         
244         if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; }
245         
246         if (outputDir == "") { outputDir += m->hasPath(listfile); }
247         string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + "database";
248         outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName);
249         
250         ofstream out;
251         m->openOutputFile(outputFileName, out);
252         
253         string header = "OTUNumber\tAbundance\t";
254         if (groupfile != "") { 
255             header = "OTUNumber\t";
256             for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; }
257         }
258         header += "repSeqName\trepSeq\tOTUConTaxonomy";
259         out << header << endl;
260         
261         for (int i = 0; i < list->getNumBins(); i++) {
262             
263             if (m->control_pressed) { break; }
264             
265             out << (i+1) << '\t';
266             
267             vector<string> binNames;
268             string bin = list->get(i);
269             
270             map<string, string>::iterator it = repNames.find(bin);
271             if (it == repNames.end()) {
272                 m->mothurOut("[ERROR: OTU " + toString(i+1) + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
273             }
274             
275             m->splitAtComma(bin, binNames);
276             
277             //sanity check
278             if (binNames.size() != classifyOtuSizes[i]) {
279                  m->mothurOut("[ERROR: OTU " + toString(i+1) + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
280             }
281             
282             //output abundances
283             if (groupfile != "") {
284                 string groupAbunds = "";
285                 map<string, int> counts;
286                 //initialize counts to 0
287                 for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; }
288                 
289                 //find abundances by group
290                 bool error = false;
291                 for (int j = 0; j < binNames.size(); j++) {
292                     string group = groupmap->getGroup(binNames[j]);
293                     if (group == "not found") {
294                         m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n");
295                         error = true;
296                     }else { counts[group]++; }
297                 }
298                 
299                 //output counts
300                 for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t';  }
301                 
302                 if (error) { m->control_pressed = true; }
303             }else { out << binNames.size() << '\t'; }
304             
305             //output repSeq
306             out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl;
307         }
308         out.close();
309         
310         delete list;
311         if (groupfile != "") { delete groupmap; }
312         
313         if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
314         
315         m->mothurOutEndLine();
316                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
317                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
318                 m->mothurOutEndLine();
319         
320         return 0;
321         
322     }
323         catch(exception& e) {
324                 m->errorOut(e, "CreateDatabaseCommand", "execute");
325                 exit(1);
326         }
327 }
328 //**********************************************************************************************************************
329 vector<int> CreateDatabaseCommand::readTax(vector<string>& taxonomies){
330         try {
331                 
332         vector<int> sizes; 
333         
334         ifstream in;
335         m->openInputFile(contaxonomyfile, in);
336         
337         //read headers
338         m->getline(in);
339         
340         while (!in.eof()) {
341             
342             if (m->control_pressed) { break; }
343             
344             string otu = ""; string tax = "unknown";
345             int size = 0;
346             
347             in >> otu >> size >> tax; m->gobble(in);
348             
349             sizes.push_back(size);
350             taxonomies.push_back(tax);
351         }
352         in.close();
353         
354         return sizes;
355     }
356         catch(exception& e) {
357                 m->errorOut(e, "CreateDatabaseCommand", "readTax");
358                 exit(1);
359         }
360 }
361 //**********************************************************************************************************************
362 vector<int> CreateDatabaseCommand::readFasta(vector<Sequence>& seqs){
363         try {
364                 
365         vector<int> sizes; 
366         
367         ifstream in;
368         m->openInputFile(repfastafile, in);
369         
370         while (!in.eof()) {
371             
372             if (m->control_pressed) { break; }
373             
374             string binInfo;
375             Sequence seq(in, binInfo, true);  m->gobble(in);
376             
377             //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file
378             vector<string> info;
379             m->splitAtChar(binInfo, info, '|');
380             if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;}
381             
382             int size = 0;
383             m->mothurConvert(info[1], size);
384             
385             sizes.push_back(size);
386             seqs.push_back(seq);
387         }
388         in.close();
389         
390         return sizes;
391     }
392         catch(exception& e) {
393                 m->errorOut(e, "CreateDatabaseCommand", "readFasta");
394                 exit(1);
395         }
396 }
397 /**********************************************************************************************************************/
398 int CreateDatabaseCommand::readNames(map<string, string>& nameMap) { 
399         try {
400                 
401                 //open input file
402                 ifstream in;
403                 m->openInputFile(repnamesfile, in);
404                 
405                 while (!in.eof()) {
406                         if (m->control_pressed) { break; }
407                         
408                         string firstCol, secondCol;
409                         in >> firstCol >> secondCol; m->gobble(in);
410                         
411                         nameMap[secondCol] = firstCol;
412                 }
413                 in.close();
414                 
415                 return nameMap.size();
416                 
417         }
418         catch(exception& e) {
419                 m->errorOut(e, "CreateDatabaseCommand", "readNames");
420                 exit(1);
421         }
422 }
423 //**********************************************************************************************************************
424 ListVector* CreateDatabaseCommand::getList(){
425         try {
426                 InputData* input = new InputData(listfile, "list");
427                 ListVector* list = input->getListVector();
428                 string lastLabel = list->getLabel();
429                 
430                 if (label == "") { label = lastLabel; delete input; return list; }
431                 
432                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
433                 set<string> labels; labels.insert(label);
434                 set<string> processedLabels;
435                 set<string> userLabels = labels;
436                 
437                 //as long as you are not at the end of the file or done wih the lines you want
438                 while((list != NULL) && (userLabels.size() != 0)) {
439                         if (m->control_pressed) {  delete input; return list;  }
440                         
441                         if(labels.count(list->getLabel()) == 1){
442                                 processedLabels.insert(list->getLabel());
443                                 userLabels.erase(list->getLabel());
444                                 break;
445                         }
446                         
447                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
448                                 string saveLabel = list->getLabel();
449                                 
450                                 delete list;
451                                 list = input->getListVector(lastLabel);
452                                 
453                                 processedLabels.insert(list->getLabel());
454                                 userLabels.erase(list->getLabel());
455                                 
456                                 //restore real lastlabel to save below
457                                 list->setLabel(saveLabel);
458                                 break;
459                         }
460                         
461                         lastLabel = list->getLabel();                   
462                         
463                         //get next line to process
464                         //prevent memory leak
465                         delete list; 
466                         list = input->getListVector();
467                 }
468                 
469                 
470                 if (m->control_pressed) { delete input; return list;  }
471                 
472                 //output error messages about any remaining user labels
473                 set<string>::iterator it;
474                 bool needToRun = false;
475                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
476                         m->mothurOut("Your file does not include the label " + *it); 
477                         if (processedLabels.count(lastLabel) != 1) {
478                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
479                                 needToRun = true;
480                         }else {
481                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
482                         }
483                 }
484                 
485                 //run last label if you need to
486                 if (needToRun == true)  {
487                         delete list;
488                         list = input->getListVector(lastLabel);
489                 }       
490                 
491                 delete input;
492
493         return list;
494     }
495         catch(exception& e) {
496                 m->errorOut(e, "CreateDatabaseCommand", "getList");
497                 exit(1);
498         }
499 }
500 //**********************************************************************************************************************
501
502