]> git.donarmstrong.com Git - mothur.git/blob - createdatabasecommand.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / createdatabasecommand.cpp
1 //
2 //  createdatabasecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/28/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "createdatabasecommand.h"
10 #include "inputdata.h"
11
12 //**********************************************************************************************************************
13 vector<string> CreateDatabaseCommand::setParameters(){  
14         try {
15                 CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
16                 CommandParameter pname("repname", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pname);
17                 CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pcontaxonomy);
18                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
19                 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
20                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
21                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
22                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
23                 
24                 vector<string> myArray;
25                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
26                 return myArray;
27         }
28         catch(exception& e) {
29                 m->errorOut(e, "CreateDatabaseCommand", "setParameters");
30                 exit(1);
31         }
32 }
33 //**********************************************************************************************************************
34 string CreateDatabaseCommand::getHelpString(){  
35         try {
36                 string helpString = "";
37                 helpString += "The create.database command reads a listfile, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, and creates a database file.\n";
38                 helpString += "The create.database command parameters are repfasta, list, repname, contaxonomy, group and label. List, repfasta, repnames, and contaxonomy are required.\n";
39         helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
40         helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
41         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile).\n";
42         helpString += "The group file is optional and will just give you the abundance breakdown by group.\n";
43         helpString += "The label parameter allows you to specify a label to be used from your listfile.\n";
44         helpString += "NOTE: Make SURE the repfasta, repnames and contaxonomy are for the same label as the listfile.\n";
45         helpString += "The create.database command should be in the following format: \n";
46                 helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n";       
47                 helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, name=final.an.0.03.rep.names, list=fina.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n";
48                 helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n";       
49                 return helpString;
50         }
51         catch(exception& e) {
52                 m->errorOut(e, "CreateDatabaseCommand", "getHelpString");
53                 exit(1);
54         }
55 }
56 //**********************************************************************************************************************
57 string CreateDatabaseCommand::getOutputFileNameTag(string type, string inputName=""){   
58         try {
59         string outputFileName = "";
60                 map<string, vector<string> >::iterator it;
61         
62         //is this a type this command creates
63         it = outputTypes.find(type);
64         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
65         else {
66             if (type == "database") {  outputFileName =  "database"; }
67             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
68         }
69         return outputFileName;
70         }
71         catch(exception& e) {
72                 m->errorOut(e, "CreateDatabaseCommand", "getOutputFileNameTag");
73                 exit(1);
74         }
75 }
76
77 //**********************************************************************************************************************
78 CreateDatabaseCommand::CreateDatabaseCommand(){ 
79         try {
80                 abort = true; calledHelp = true; 
81                 setParameters();
82                 vector<string> tempOutNames;
83                 outputTypes["database"] = tempOutNames;
84         }
85         catch(exception& e) {
86                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
87                 exit(1);
88         }
89 }
90
91 //**********************************************************************************************************************
92 CreateDatabaseCommand::CreateDatabaseCommand(string option)  {
93         try{
94                 abort = false; calledHelp = false;   
95         
96                 //allow user to run help
97                 if (option == "help") { 
98                         help(); abort = true; calledHelp = true;
99                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
100                 else {
101                         vector<string> myArray = setParameters();
102                         
103                         OptionParser parser(option);
104                         map<string, string> parameters = parser.getParameters();
105                         
106                         ValidParameters validParameter;
107                         map<string, string>::iterator it;
108             
109                         //check to make sure all parameters are valid for command
110                         for (it = parameters.begin(); it != parameters.end(); it++) { 
111                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
112                         }
113                         
114                         //initialize outputTypes
115                         vector<string> tempOutNames;
116                         outputTypes["database"] = tempOutNames;
117             
118                         //if the user changes the input directory command factory will send this info to us in the output parameter 
119                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
120                         if (inputDir == "not found"){   inputDir = "";          }
121                         else {
122                                 string path;
123                                 it = parameters.find("list");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
129                                 }
130                                 
131                                 it = parameters.find("repname");
132                                 //user has given a template file
133                                 if(it != parameters.end()){ 
134                                         path = m->hasPath(it->second);
135                                         //if the user has not given a path then, add inputdir. else leave path alone.
136                                         if (path == "") {       parameters["repname"] = inputDir + it->second;          }
137                                 }
138                                 
139                                 it = parameters.find("contaxonomy");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["contaxonomy"] = inputDir + it->second;              }
145                                 }
146                                 
147                                 it = parameters.find("repfasta");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["repfasta"] = inputDir + it->second;         }
153                                 }
154                                 
155                                 it = parameters.find("group");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
161                                 }
162                         }
163             
164                         
165                         //if the user changes the output directory command factory will send this info to us in the output parameter 
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
167                         
168                         //check for required parameters
169                         listfile = validParameter.validFile(parameters, "list", true);
170                         if (listfile == "not found") {                          
171                                 //if there is a current list file, use it
172                                 listfile = m->getListFile(); 
173                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
174                                 else {  m->mothurOut("You have no current listfile and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
175                         }
176                         else if (listfile == "not open") { abort = true; }      
177                         else { m->setListFile(listfile); }
178                         
179                         contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
180                         if (contaxonomyfile == "not found") {  //if there is a current list file, use it
181                contaxonomyfile = "";  m->mothurOut("The contaxonomy parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
182                         }
183                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
184
185             repfastafile = validParameter.validFile(parameters, "repfasta", true);
186                         if (repfastafile == "not found") {  //if there is a current list file, use it
187                 repfastafile = "";  m->mothurOut("The repfasta parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
188                         }
189                         else if (repfastafile == "not open") { repfastafile = ""; abort = true; }
190
191             repnamesfile = validParameter.validFile(parameters, "repname", true);
192                         if (repnamesfile == "not found") {  //if there is a current list file, use it
193                 repnamesfile = "";  m->mothurOut("The repnames parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
194                         }
195                         else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; }
196
197                         groupfile = validParameter.validFile(parameters, "group", true);
198                         if (groupfile == "not open") { groupfile = ""; abort = true; }  
199                         else if (groupfile == "not found") { groupfile = ""; }
200                         else { m->setGroupFile(groupfile); }
201                         
202                         //check for optional parameter and set defaults
203                         // ...at some point should added some additional type checking...
204             label = validParameter.validFile(parameters, "label", false);                       
205                         if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your listfile.\n");}
206         }
207         }
208         catch(exception& e) {
209                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
210                 exit(1);
211         }
212 }
213 //**********************************************************************************************************************
214 int CreateDatabaseCommand::execute(){
215         try {
216                 
217                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
218         
219         //taxonomies holds the taxonomy info for each Otu
220         //classifyOtuSizes holds the size info of each Otu to help with error checking
221         vector<string> taxonomies;
222         vector<int> classifyOtuSizes = readTax(taxonomies);
223         
224         if (m->control_pressed) { return 0; }
225         
226         vector<Sequence> seqs;
227         vector<int> repOtusSizes = readFasta(seqs);
228         
229         if (m->control_pressed) { return 0; }
230         
231         //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map.
232         map<string, string> repNames;
233         int numUniqueNamesFile = m->readNames(repnamesfile, repNames);
234         
235         //are there the same number of otus in the fasta and name files
236         if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
237         
238         if (m->control_pressed) { return 0; }
239         
240         //are there the same number of OTUs in the tax and fasta file
241         if (classifyOtuSizes.size() != repOtusSizes.size()) { m->mothurOut("[ERROR]: you have " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
242
243         if (m->control_pressed) { return 0; }
244         
245         //at this point we have the same number of OTUs. Are the sizes we have found so far accurate?
246         for (int i = 0; i < classifyOtuSizes.size(); i++) {
247             if (classifyOtuSizes[i] != repOtusSizes[i]) {
248                m->mothurOut("[ERROR]: OTU size info does not match for bin " + toString(i+1) + ". The contaxonomy file indicated the OTU represented " + toString(classifyOtuSizes[i]) + " sequences, but the repfasta file had " + toString(repOtusSizes[i]) + ".  These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; 
249             }
250         }
251         
252         if (m->control_pressed) { return 0; }
253         
254         //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile
255         ListVector* list = getList();
256         
257         if (m->control_pressed) { delete list; return 0; }
258         
259         GroupMap* groupmap = NULL;
260         if (groupfile != "") {
261                         groupmap = new GroupMap(groupfile);
262                         groupmap->readMap();
263                 }
264         
265         if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; }
266         
267         if (outputDir == "") { outputDir += m->hasPath(listfile); }
268         string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + getOutputFileNameTag("database");
269         outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName);
270         
271         ofstream out;
272         m->openOutputFile(outputFileName, out);
273         
274         string header = "OTUNumber\tAbundance\t";
275         if (groupfile != "") { 
276             header = "OTUNumber\t";
277             for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; }
278         }
279         header += "repSeqName\trepSeq\tOTUConTaxonomy";
280         out << header << endl;
281         
282         for (int i = 0; i < list->getNumBins(); i++) {
283             
284             if (m->control_pressed) { break; }
285             
286             out << (i+1) << '\t';
287             
288             vector<string> binNames;
289             string bin = list->get(i);
290             
291             map<string, string>::iterator it = repNames.find(bin);
292             if (it == repNames.end()) {
293                 m->mothurOut("[ERROR: OTU " + toString(i+1) + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
294             }
295             
296             m->splitAtComma(bin, binNames);
297             
298             //sanity check
299             if (binNames.size() != classifyOtuSizes[i]) {
300                  m->mothurOut("[ERROR: OTU " + toString(i+1) + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
301             }
302             
303             //output abundances
304             if (groupfile != "") {
305                 string groupAbunds = "";
306                 map<string, int> counts;
307                 //initialize counts to 0
308                 for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; }
309                 
310                 //find abundances by group
311                 bool error = false;
312                 for (int j = 0; j < binNames.size(); j++) {
313                     string group = groupmap->getGroup(binNames[j]);
314                     if (group == "not found") {
315                         m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n");
316                         error = true;
317                     }else { counts[group]++; }
318                 }
319                 
320                 //output counts
321                 for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t';  }
322                 
323                 if (error) { m->control_pressed = true; }
324             }else { out << binNames.size() << '\t'; }
325             
326             //output repSeq
327             out << it->second << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl;
328         }
329         out.close();
330         
331         delete list;
332         if (groupfile != "") { delete groupmap; }
333         
334         if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
335         
336         m->mothurOutEndLine();
337                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
338                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
339                 m->mothurOutEndLine();
340         
341         return 0;
342         
343     }
344         catch(exception& e) {
345                 m->errorOut(e, "CreateDatabaseCommand", "execute");
346                 exit(1);
347         }
348 }
349 //**********************************************************************************************************************
350 vector<int> CreateDatabaseCommand::readTax(vector<string>& taxonomies){
351         try {
352                 
353         vector<int> sizes; 
354         
355         ifstream in;
356         m->openInputFile(contaxonomyfile, in);
357         
358         //read headers
359         m->getline(in);
360         
361         while (!in.eof()) {
362             
363             if (m->control_pressed) { break; }
364             
365             string otu = ""; string tax = "unknown";
366             int size = 0;
367             
368             in >> otu >> size >> tax; m->gobble(in);
369             
370             sizes.push_back(size);
371             taxonomies.push_back(tax);
372         }
373         in.close();
374         
375         return sizes;
376     }
377         catch(exception& e) {
378                 m->errorOut(e, "CreateDatabaseCommand", "readTax");
379                 exit(1);
380         }
381 }
382 //**********************************************************************************************************************
383 vector<int> CreateDatabaseCommand::readFasta(vector<Sequence>& seqs){
384         try {
385                 
386         vector<int> sizes; 
387         
388         ifstream in;
389         m->openInputFile(repfastafile, in);
390         
391         while (!in.eof()) {
392             
393             if (m->control_pressed) { break; }
394             
395             string binInfo;
396             Sequence seq(in, binInfo, true);  m->gobble(in);
397             
398             //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file
399             vector<string> info;
400             m->splitAtChar(binInfo, info, '|');
401             if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;}
402             
403             int size = 0;
404             m->mothurConvert(info[1], size);
405             
406             sizes.push_back(size);
407             seqs.push_back(seq);
408         }
409         in.close();
410         
411         return sizes;
412     }
413         catch(exception& e) {
414                 m->errorOut(e, "CreateDatabaseCommand", "readFasta");
415                 exit(1);
416         }
417 }
418 //**********************************************************************************************************************
419 ListVector* CreateDatabaseCommand::getList(){
420         try {
421                 InputData* input = new InputData(listfile, "list");
422                 ListVector* list = input->getListVector();
423                 string lastLabel = list->getLabel();
424                 
425                 if (label == "") { label = lastLabel; delete input; return list; }
426                 
427                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
428                 set<string> labels; labels.insert(label);
429                 set<string> processedLabels;
430                 set<string> userLabels = labels;
431                 
432                 //as long as you are not at the end of the file or done wih the lines you want
433                 while((list != NULL) && (userLabels.size() != 0)) {
434                         if (m->control_pressed) {  delete input; return list;  }
435                         
436                         if(labels.count(list->getLabel()) == 1){
437                                 processedLabels.insert(list->getLabel());
438                                 userLabels.erase(list->getLabel());
439                                 break;
440                         }
441                         
442                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
443                                 string saveLabel = list->getLabel();
444                                 
445                                 delete list;
446                                 list = input->getListVector(lastLabel);
447                                 
448                                 processedLabels.insert(list->getLabel());
449                                 userLabels.erase(list->getLabel());
450                                 
451                                 //restore real lastlabel to save below
452                                 list->setLabel(saveLabel);
453                                 break;
454                         }
455                         
456                         lastLabel = list->getLabel();                   
457                         
458                         //get next line to process
459                         //prevent memory leak
460                         delete list; 
461                         list = input->getListVector();
462                 }
463                 
464                 
465                 if (m->control_pressed) { delete input; return list;  }
466                 
467                 //output error messages about any remaining user labels
468                 set<string>::iterator it;
469                 bool needToRun = false;
470                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
471                         m->mothurOut("Your file does not include the label " + *it); 
472                         if (processedLabels.count(lastLabel) != 1) {
473                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
474                                 needToRun = true;
475                         }else {
476                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
477                         }
478                 }
479                 
480                 //run last label if you need to
481                 if (needToRun == true)  {
482                         delete list;
483                         list = input->getListVector(lastLabel);
484                 }       
485                 
486                 delete input;
487
488         return list;
489     }
490         catch(exception& e) {
491                 m->errorOut(e, "CreateDatabaseCommand", "getList");
492                 exit(1);
493         }
494 }
495 //**********************************************************************************************************************
496
497