]> git.donarmstrong.com Git - mothur.git/blob - createdatabasecommand.cpp
changed random forest output filename
[mothur.git] / createdatabasecommand.cpp
1 //
2 //  createdatabasecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/28/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "createdatabasecommand.h"
10 #include "inputdata.h"
11
12 //**********************************************************************************************************************
13 vector<string> CreateDatabaseCommand::setParameters(){  
14         try {
15                 CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none","database",false,true,true); parameters.push_back(pfasta);
16                 CommandParameter pname("repname", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname);
17         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
19                 CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pcontaxonomy);
20                 CommandParameter plist("list", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(plist);
21         CommandParameter pshared("shared", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(pshared);
22                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
23                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "CreateDatabaseCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string CreateDatabaseCommand::getHelpString(){  
37         try {
38                 string helpString = "";
39                 helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, or count file and creates a database file.\n";
40                 helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group, count and label. List, repfasta, repnames or count, and contaxonomy are required.\n";
41         helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
42         helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
43         helpString += "The count file is the count file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, count=yourCountFile). If it includes group info, mothur will give you the abundance breakdown by group. \n";
44         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile, name=yourNameFile).\n";
45         helpString += "The group file is optional and will just give you the abundance breakdown by group.\n";
46         helpString += "The label parameter allows you to specify a label to be used from your listfile.\n";
47         helpString += "NOTE: Make SURE the repfasta, repnames and contaxonomy are for the same label as the listfile.\n";
48         helpString += "The create.database command should be in the following format: \n";
49                 helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n";       
50                 helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, repname=final.an.0.03.rep.names, list=final.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n";
51                 helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n";       
52                 return helpString;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "CreateDatabaseCommand", "getHelpString");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string CreateDatabaseCommand::getOutputPattern(string type) {
61     try {
62         string pattern = "";
63         
64         if (type == "database") {  pattern = "[filename],database"; }
65         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
66         
67         return pattern;
68     }
69     catch(exception& e) {
70         m->errorOut(e, "CreateDatabaseCommand", "getOutputPattern");
71         exit(1);
72     }
73 }
74 //**********************************************************************************************************************
75 CreateDatabaseCommand::CreateDatabaseCommand(){ 
76         try {
77                 abort = true; calledHelp = true; 
78                 setParameters();
79                 vector<string> tempOutNames;
80                 outputTypes["database"] = tempOutNames;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
84                 exit(1);
85         }
86 }
87
88 //**********************************************************************************************************************
89 CreateDatabaseCommand::CreateDatabaseCommand(string option)  {
90         try{
91                 abort = false; calledHelp = false;   
92         
93                 //allow user to run help
94                 if (option == "help") { 
95                         help(); abort = true; calledHelp = true;
96                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
97                 else {
98                         vector<string> myArray = setParameters();
99                         
100                         OptionParser parser(option);
101                         map<string, string> parameters = parser.getParameters();
102                         
103                         ValidParameters validParameter;
104                         map<string, string>::iterator it;
105             
106                         //check to make sure all parameters are valid for command
107                         for (it = parameters.begin(); it != parameters.end(); it++) { 
108                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
109                         }
110                         
111                         //initialize outputTypes
112                         vector<string> tempOutNames;
113                         outputTypes["database"] = tempOutNames;
114             
115                         //if the user changes the input directory command factory will send this info to us in the output parameter 
116                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
117                         if (inputDir == "not found"){   inputDir = "";          }
118                         else {
119                                 string path;
120                                 it = parameters.find("list");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
126                                 }
127                                 
128                                 it = parameters.find("repname");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["repname"] = inputDir + it->second;          }
134                                 }
135                                 
136                                 it = parameters.find("contaxonomy");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["contaxonomy"] = inputDir + it->second;              }
142                                 }
143                                 
144                                 it = parameters.find("repfasta");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["repfasta"] = inputDir + it->second;         }
150                                 }
151                                 
152                                 it = parameters.find("group");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
158                                 }
159                 
160                 it = parameters.find("count");
161                                 //user has given a template file
162                                 if(it != parameters.end()){
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
166                                 }
167                 
168                 it = parameters.find("shared");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
174                                 }
175                         }
176             
177                         
178                         //if the user changes the output directory command factory will send this info to us in the output parameter 
179                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
180                         
181                         //check for required parameters
182                         listfile = validParameter.validFile(parameters, "list", true);
183                         if (listfile == "not found") {  listfile = "";                  }
184                         else if (listfile == "not open") { listfile = ""; abort = true; }       
185                         else { m->setListFile(listfile); }
186             
187             sharedfile = validParameter.validFile(parameters, "shared", true);
188                         if (sharedfile == "not found") {        sharedfile = "";                        }
189                         else if (sharedfile == "not open") { sharedfile = ""; abort = true; }   
190                         else { m->setSharedFile(sharedfile); }
191             
192             if ((sharedfile == "") && (listfile == "")) { 
193                                 //is there are current file available for either of these?
194                                 //give priority to list, then shared
195                                 listfile = m->getListFile(); 
196                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
197                                 else { 
198                                         sharedfile = m->getSharedFile(); 
199                                         if (sharedfile != "") {  m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
200                                         else { 
201                                                 m->mothurOut("No valid current files. You must provide a shared or list file before you can use the create.database command."); m->mothurOutEndLine(); 
202                                                 abort = true;
203                                         }
204                                 }
205                         }
206                         else if ((sharedfile != "") && (listfile != "")) { m->mothurOut("When executing a create.database command you must enter ONLY ONE of the following: shared or list."); m->mothurOutEndLine(); abort = true; }
207             
208             if (sharedfile != "") { if (outputDir == "") { outputDir = m->hasPath(sharedfile); } }
209             else { if (outputDir == "") { outputDir = m->hasPath(listfile); } }
210                         
211                         contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
212                         if (contaxonomyfile == "not found") {  //if there is a current list file, use it
213                contaxonomyfile = "";  m->mothurOut("The contaxonomy parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
214                         }
215                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
216
217             repfastafile = validParameter.validFile(parameters, "repfasta", true);
218                         if (repfastafile == "not found") {  //if there is a current list file, use it
219                 repfastafile = "";  m->mothurOut("The repfasta parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
220                         }
221                         else if (repfastafile == "not open") { repfastafile = ""; abort = true; }
222
223             repnamesfile = validParameter.validFile(parameters, "repname", true);
224                         if (repnamesfile == "not found") {  repnamesfile = "";                          }
225                         else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; }
226             
227             countfile = validParameter.validFile(parameters, "count", true);
228                         if (countfile == "not found") {  countfile = "";                        }
229                         else if (countfile == "not open") { countfile = ""; abort = true; }
230             
231             if ((countfile == "") && (repnamesfile == "")) {
232                 //if there is a current name file, use it, else look for current count file
233                 string repnamesfile = m->getNameFile();
234                                 if (repnamesfile != "") {  m->mothurOut("Using " + repnamesfile + " as input file for the repname parameter."); m->mothurOutEndLine(); }
235                                 else {
236                     countfile = m->getCountTableFile();
237                     if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
238                     else { m->mothurOut("[ERROR]: You must provide a count or repname file."); m->mothurOutEndLine(); abort = true;  }
239                 }
240             }
241
242                         groupfile = validParameter.validFile(parameters, "group", true);
243                         if (groupfile == "not open") { groupfile = ""; abort = true; }  
244                         else if (groupfile == "not found") { groupfile = ""; }
245                         else { m->setGroupFile(groupfile); }
246                         
247                         //check for optional parameter and set defaults
248                         // ...at some point should added some additional type checking...
249             label = validParameter.validFile(parameters, "label", false);                       
250                         if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your listfile.\n");}
251         }
252         }
253         catch(exception& e) {
254                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
255                 exit(1);
256         }
257 }
258 //**********************************************************************************************************************
259 int CreateDatabaseCommand::execute(){
260         try {
261                 
262                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
263         
264         //taxonomies holds the taxonomy info for each Otu
265         //classifyOtuSizes holds the size info of each Otu to help with error checking
266         vector<string> taxonomies;
267         vector<string> otuLabels;
268         vector<int> classifyOtuSizes = readTax(taxonomies, otuLabels);
269         
270         if (m->control_pressed) { return 0; }
271         
272         vector<Sequence> seqs;
273         vector<int> repOtusSizes = readFasta(seqs);
274         
275         if (m->control_pressed) { return 0; }
276         
277         //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map.
278         map<string, string> repNames;
279         map<string, int> nameMap;
280         int numUniqueNamesFile = 0;
281         CountTable ct;
282         if (countfile == "") {
283             numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1);
284             //the repnames file does not have the same order as the list file bins so we need to sort and reassemble for the search below
285             map<string, string> tempRepNames;
286             for (map<string, string>::iterator it = repNames.begin(); it != repNames.end();) {
287                 string bin = it->first;
288                 vector<string> temp;
289                 m->splitAtChar(bin, temp, ',');
290                 sort(temp.begin(), temp.end());
291                 bin = "";
292                 for (int i = 0; i < temp.size()-1; i++) {
293                     bin += temp[i] + ',';
294                 }
295                 bin += temp[temp.size()-1];
296                 tempRepNames[bin] = it->second;
297                 repNames.erase(it++);
298             }
299             repNames = tempRepNames;
300         }else {
301             ct.readTable(countfile, true);
302             numUniqueNamesFile = ct.getNumUniqueSeqs();
303             nameMap = ct.getNameMap();
304         }
305         
306         //are there the same number of otus in the fasta and name files
307         if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
308         
309         if (m->control_pressed) { return 0; }
310         
311         //are there the same number of OTUs in the tax and fasta file
312         if (classifyOtuSizes.size() != repOtusSizes.size()) { m->mothurOut("[ERROR]: you have " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
313
314         if (m->control_pressed) { return 0; }
315         
316         //at this point we have the same number of OTUs. Are the sizes we have found so far accurate?
317         for (int i = 0; i < classifyOtuSizes.size(); i++) {
318             if (classifyOtuSizes[i] != repOtusSizes[i]) {
319                m->mothurOut("[ERROR]: OTU size info does not match for bin " + toString(i+1) + ". The contaxonomy file indicated the OTU represented " + toString(classifyOtuSizes[i]) + " sequences, but the repfasta file had " + toString(repOtusSizes[i]) + ".  These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; 
320             }
321         }
322         
323         if (m->control_pressed) { return 0; }
324         
325         
326         map<string, string> variables; 
327         if (listfile != "") {  variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); }
328         else { variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); }
329         string outputFileName = getOutputFileName("database", variables); 
330         outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName);
331         
332         ofstream out;
333         m->openOutputFile(outputFileName, out);
334         
335         string header = "OTUNumber\tAbundance\t";
336
337         
338         if (listfile != "") {
339             //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile
340             ListVector* list = getList();
341             
342             if (otuLabels.size() != list->getNumBins()) { 
343                 m->mothurOut("[ERROR]: you have " + toString(otuLabels.size()) + " otus in your contaxonomy file, but your list file has " + toString(list->getNumBins()) + " otus. These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true;  }
344             
345             if (m->control_pressed) { delete list; return 0; }
346             
347             GroupMap* groupmap = NULL;
348             if (groupfile != "") {
349                 groupmap = new GroupMap(groupfile);
350                 groupmap->readMap();
351             }
352             
353             if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; }
354             
355             if (groupfile != "") { 
356                 header = "OTUNumber\t";
357                 for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; }
358             }else if (countfile != "") {
359                 if (ct.hasGroupInfo()) {
360                     header = "OTUNumber\t";
361                     for (int i = 0; i < ct.getNamesOfGroups().size(); i++) { header += (ct.getNamesOfGroups())[i] + '\t'; }
362                 }
363             }
364             header += "repSeqName\trepSeq\tOTUConTaxonomy";
365             out << header << endl;
366             
367             for (int i = 0; i < list->getNumBins(); i++) {
368                 
369                 if (m->control_pressed) { break; }
370                 
371                 out << otuLabels[i] << '\t';
372                 
373                 vector<string> binNames;
374                 string bin = list->get(i);
375                 m->splitAtComma(bin, binNames);
376                 
377                 string seqRepName = "";
378                 int numSeqsRep = 0;
379                 
380                 if (countfile == "") {
381                     sort(binNames.begin(), binNames.end());
382                     bin = "";
383                     for (int j = 0; j < binNames.size()-1; j++) {
384                         bin += binNames[j] + ',';
385                     }
386                     bin += binNames[binNames.size()-1];
387                     map<string, string>::iterator it = repNames.find(bin);
388                     
389                     if (it == repNames.end()) {
390                         m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
391                     }else { seqRepName = it->second;  numSeqsRep = binNames.size(); }
392                     
393                     //sanity check
394                     if (binNames.size() != classifyOtuSizes[i]) {
395                         m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
396                     }
397                 }else {
398                     //find rep sequence in bin
399                     for (int j = 0; j < binNames.size(); j++) {
400                         map<string, int>::iterator itNameMap = nameMap.find(binNames[j]); //if you are in the counttable you must be the rep. because get.oturep with a countfile only includes the rep sequences in the rep.count file.
401                         if (itNameMap != nameMap.end()) {
402                             seqRepName = itNameMap->first;
403                             numSeqsRep = itNameMap->second;
404                             j += binNames.size();
405                         }
406                     }
407                     
408                     if (seqRepName == "") {
409                         m->mothurOut("[ERROR: OTU " + otuLabels[i] + " is not in the count file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
410                     }
411                     
412                     if (numSeqsRep != classifyOtuSizes[i]) {
413                         m->mothurOut("[ERROR: OTU " + otuLabels[i] + " contains " + toString(numSeqsRep) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[i]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
414                     }
415                 }
416                 
417                 //output abundances
418                 if (groupfile != "") {
419                     string groupAbunds = "";
420                     map<string, int> counts;
421                     //initialize counts to 0
422                     for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; }
423                     
424                     //find abundances by group
425                     bool error = false;
426                     for (int j = 0; j < binNames.size(); j++) {
427                         string group = groupmap->getGroup(binNames[j]);
428                         if (group == "not found") {
429                             m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n");
430                             error = true;
431                         }else { counts[group]++; }
432                     }
433                     
434                     //output counts
435                     for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t';  }
436                     
437                     if (error) { m->control_pressed = true; }
438                 }else if (countfile != "") {
439                     if (ct.hasGroupInfo()) {
440                         vector<int> groupCounts = ct.getGroupCounts(seqRepName);
441                         for (int j = 0; j < groupCounts.size(); j++) { out << groupCounts[j] << '\t';  }
442                     }else { out << numSeqsRep << '\t'; }
443                 }else { out << numSeqsRep << '\t'; }
444                 
445                 //output repSeq
446                 out << seqRepName << '\t' << seqs[i].getAligned() << '\t' << taxonomies[i] << endl;
447             }
448             
449             
450             delete list;
451             if (groupfile != "") { delete groupmap; }
452            
453         }else {
454             vector<SharedRAbundVector*> lookup = getShared();
455             
456             header = "OTUNumber\t";
457             for (int i = 0; i < lookup.size(); i++) { header += lookup[i]->getGroup() + '\t'; }
458             header += "repSeqName\trepSeq\tOTUConTaxonomy";
459             out << header << endl;
460             
461             for (int h = 0; h < lookup[0]->getNumBins(); h++) {
462                 
463                 if (m->control_pressed) { break; }
464                 
465                 int index = findIndex(otuLabels, m->currentBinLabels[h]);
466                 if (index == -1) {  m->mothurOut("[ERROR]: " + m->currentBinLabels[h] + " is not in your constaxonomy file, aborting.\n"); m->control_pressed = true; }
467                 
468                 if (m->control_pressed) { break; }
469                 
470                 out << otuLabels[index] << '\t';
471                 
472                 int totalAbund = 0;
473                 for (int i = 0; i < lookup.size(); i++) { 
474                     int abund = lookup[i]->getAbundance(h);
475                     totalAbund += abund; 
476                     out << abund << '\t';
477                 }
478                 
479                 //sanity check
480                 if (totalAbund != classifyOtuSizes[index]) {
481                     m->mothurOut("[WARNING]: OTU " + m->currentBinLabels[h] + " contains " + toString(totalAbund) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); //m->control_pressed = true;   break;
482                 }
483                 
484                 //output repSeq
485                 out << seqs[index].getName() << '\t' << seqs[index].getAligned() << '\t' << taxonomies[index] << endl;
486             }
487         }
488         out.close();
489         if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
490         
491         m->mothurOutEndLine();
492                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
493                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
494                 m->mothurOutEndLine();
495         
496         return 0;
497         
498     }
499         catch(exception& e) {
500                 m->errorOut(e, "CreateDatabaseCommand", "execute");
501                 exit(1);
502         }
503 }
504 //**********************************************************************************************************************
505 int CreateDatabaseCommand::findIndex(vector<string>& otuLabels, string label){
506         try {
507         int index = -1;
508         for (int i = 0; i < otuLabels.size(); i++) {
509             if (otuLabels[i] == label) { index = i; break; }
510         }
511                 return index;
512     }
513         catch(exception& e) {
514                 m->errorOut(e, "CreateDatabaseCommand", "findIndex");
515                 exit(1);
516         }
517 }
518 //**********************************************************************************************************************
519 vector<int> CreateDatabaseCommand::readTax(vector<string>& taxonomies, vector<string>& otuLabels){
520         try {
521                 
522         vector<int> sizes; 
523         
524         ifstream in;
525         m->openInputFile(contaxonomyfile, in);
526         
527         //read headers
528         m->getline(in);
529         
530         while (!in.eof()) {
531             
532             if (m->control_pressed) { break; }
533             
534             string otu = ""; string tax = "unknown";
535             int size = 0;
536             
537             in >> otu >> size >> tax; m->gobble(in);
538             
539             sizes.push_back(size);
540             taxonomies.push_back(tax);
541             otuLabels.push_back(otu);
542         }
543         in.close();
544         
545         return sizes;
546     }
547         catch(exception& e) {
548                 m->errorOut(e, "CreateDatabaseCommand", "readTax");
549                 exit(1);
550         }
551 }
552 //**********************************************************************************************************************
553 vector<int> CreateDatabaseCommand::readFasta(vector<Sequence>& seqs){
554         try {
555                 
556         vector<int> sizes; 
557         
558         ifstream in;
559         m->openInputFile(repfastafile, in);
560         
561         set<int> sanity;
562         while (!in.eof()) {
563             
564             if (m->control_pressed) { break; }
565             
566             string binInfo;
567             Sequence seq(in, binInfo, true);  m->gobble(in);
568             
569             //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file
570             vector<string> info;
571             m->splitAtChar(binInfo, info, '|');
572             //if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;}
573             
574             int size = 0;
575             m->mothurConvert(info[1], size);
576             
577             int binNumber = 0;
578             string temp = "";
579             for (int i = 0; i < info[0].size(); i++) { if (isspace(info[0][i])) {;}else{temp +=info[0][i]; } }
580             m->mothurConvert(temp, binNumber);
581             set<int>::iterator it = sanity.find(binNumber);
582             if (it != sanity.end()) {
583                 m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;
584             }else { sanity.insert(binNumber); }
585             
586             sizes.push_back(size);
587             seqs.push_back(seq);
588         }
589         in.close();
590         
591         return sizes;
592     }
593         catch(exception& e) {
594                 m->errorOut(e, "CreateDatabaseCommand", "readFasta");
595                 exit(1);
596         }
597 }
598 //**********************************************************************************************************************
599 ListVector* CreateDatabaseCommand::getList(){
600         try {
601                 InputData* input = new InputData(listfile, "list");
602                 ListVector* list = input->getListVector();
603                 string lastLabel = list->getLabel();
604                 
605                 if (label == "") { label = lastLabel; delete input; return list; }
606                 
607                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
608                 set<string> labels; labels.insert(label);
609                 set<string> processedLabels;
610                 set<string> userLabels = labels;
611                 
612                 //as long as you are not at the end of the file or done wih the lines you want
613                 while((list != NULL) && (userLabels.size() != 0)) {
614                         if (m->control_pressed) {  delete input; return list;  }
615                         
616                         if(labels.count(list->getLabel()) == 1){
617                                 processedLabels.insert(list->getLabel());
618                                 userLabels.erase(list->getLabel());
619                                 break;
620                         }
621                         
622                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
623                                 string saveLabel = list->getLabel();
624                                 
625                                 delete list;
626                                 list = input->getListVector(lastLabel);
627                                 
628                                 processedLabels.insert(list->getLabel());
629                                 userLabels.erase(list->getLabel());
630                                 
631                                 //restore real lastlabel to save below
632                                 list->setLabel(saveLabel);
633                                 break;
634                         }
635                         
636                         lastLabel = list->getLabel();                   
637                         
638                         //get next line to process
639                         //prevent memory leak
640                         delete list; 
641                         list = input->getListVector();
642                 }
643                 
644                 
645                 if (m->control_pressed) { delete input; return list;  }
646                 
647                 //output error messages about any remaining user labels
648                 set<string>::iterator it;
649                 bool needToRun = false;
650                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
651                         m->mothurOut("Your file does not include the label " + *it); 
652                         if (processedLabels.count(lastLabel) != 1) {
653                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
654                                 needToRun = true;
655                         }else {
656                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
657                         }
658                 }
659                 
660                 //run last label if you need to
661                 if (needToRun == true)  {
662                         delete list;
663                         list = input->getListVector(lastLabel);
664                 }       
665                 
666                 delete input;
667
668         return list;
669     }
670         catch(exception& e) {
671                 m->errorOut(e, "CreateDatabaseCommand", "getList");
672                 exit(1);
673         }
674 }
675 //**********************************************************************************************************************
676 vector<SharedRAbundVector*> CreateDatabaseCommand::getShared(){
677         try {
678                 InputData input(sharedfile, "sharedfile");
679                 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
680                 string lastLabel = lookup[0]->getLabel();
681                 
682                 if (label == "") { label = lastLabel; return lookup; }
683                 
684                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
685                 set<string> labels; labels.insert(label);
686                 set<string> processedLabels;
687                 set<string> userLabels = labels;
688                 
689                 //as long as you are not at the end of the file or done wih the lines you want
690                 while((lookup[0] != NULL) && (userLabels.size() != 0)) {
691                         if (m->control_pressed) {  return lookup;  }
692                         
693                         if(labels.count(lookup[0]->getLabel()) == 1){
694                                 processedLabels.insert(lookup[0]->getLabel());
695                                 userLabels.erase(lookup[0]->getLabel());
696                                 break;
697                         }
698                         
699                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
700                                 string saveLabel = lookup[0]->getLabel();
701                                 
702                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
703                                 lookup = input.getSharedRAbundVectors(lastLabel);
704                                 
705                                 processedLabels.insert(lookup[0]->getLabel());
706                                 userLabels.erase(lookup[0]->getLabel());
707                                 
708                                 //restore real lastlabel to save below
709                                 lookup[0]->setLabel(saveLabel);
710                                 break;
711                         }
712                         
713                         lastLabel = lookup[0]->getLabel();                      
714                         
715                         //get next line to process
716                         //prevent memory leak
717                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
718                         lookup = input.getSharedRAbundVectors();
719                 }
720                 
721                 
722                 if (m->control_pressed) { return lookup;  }
723                 
724                 //output error messages about any remaining user labels
725                 set<string>::iterator it;
726                 bool needToRun = false;
727                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
728                         m->mothurOut("Your file does not include the label " + *it); 
729                         if (processedLabels.count(lastLabel) != 1) {
730                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
731                                 needToRun = true;
732                         }else {
733                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
734                         }
735                 }
736                 
737                 //run last label if you need to
738                 if (needToRun == true)  {
739                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
740                         lookup = input.getSharedRAbundVectors(lastLabel);
741                 }       
742         
743         return lookup;
744     }
745         catch(exception& e) {
746                 m->errorOut(e, "CreateDatabaseCommand", "getShared");
747                 exit(1);
748         }
749 }
750
751 //**********************************************************************************************************************
752
753