]> git.donarmstrong.com Git - mothur.git/blob - createdatabasecommand.cpp
fixes while testing 1.33.0
[mothur.git] / createdatabasecommand.cpp
1 //
2 //  createdatabasecommand.cpp
3 //  Mothur
4 //
5 //  Created by Sarah Westcott on 3/28/12.
6 //  Copyright (c) 2012 Schloss Lab. All rights reserved.
7 //
8
9 #include "createdatabasecommand.h"
10 #include "inputdata.h"
11
12 //**********************************************************************************************************************
13 vector<string> CreateDatabaseCommand::setParameters(){  
14         try {
15                 CommandParameter pfasta("repfasta", "InputTypes", "", "", "none", "none", "none","database",false,true,true); parameters.push_back(pfasta);
16                 CommandParameter pname("repname", "InputTypes", "", "", "NameCount", "NameCount", "none","",false,false,true); parameters.push_back(pname);
17         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "NameCount", "none","",false,false,true); parameters.push_back(pcount);
18                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","",false,false,true); parameters.push_back(pgroup);
19                 CommandParameter pcontaxonomy("contaxonomy", "InputTypes", "", "", "none", "none", "none","",false,true,true); parameters.push_back(pcontaxonomy);
20                 CommandParameter plist("list", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(plist);
21         CommandParameter pshared("shared", "InputTypes", "", "", "ListShared", "ListShared", "none","",false,false,true); parameters.push_back(pshared);
22                 CommandParameter plabel("label", "String", "", "", "", "", "","",false,false); parameters.push_back(plabel);
23                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
24                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
25                 
26                 vector<string> myArray;
27                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
28                 return myArray;
29         }
30         catch(exception& e) {
31                 m->errorOut(e, "CreateDatabaseCommand", "setParameters");
32                 exit(1);
33         }
34 }
35 //**********************************************************************************************************************
36 string CreateDatabaseCommand::getHelpString(){  
37         try {
38                 string helpString = "";
39                 helpString += "The create.database command reads a list file or a shared file, *.cons.taxonomy, *.rep.fasta, *.rep.names and optional groupfile, or count file and creates a database file.\n";
40                 helpString += "The create.database command parameters are repfasta, list, shared, repname, contaxonomy, group, count and label. List, repfasta, repnames or count, and contaxonomy are required.\n";
41         helpString += "The repfasta file is fasta file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
42         helpString += "The repname file is the name file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, name=yourNameFile).\n";
43         helpString += "The count file is the count file outputted by get.oturep(fasta=yourFastaFile, list=yourListfile, column=yourDistFile, count=yourCountFile). If it includes group info, mothur will give you the abundance breakdown by group. \n";
44         helpString += "The contaxonomy file is the taxonomy file outputted by classify.otu(list=yourListfile, taxonomy=yourTaxonomyFile, name=yourNameFile).\n";
45         helpString += "The group file is optional and will just give you the abundance breakdown by group.\n";
46         helpString += "The label parameter allows you to specify a label to be used from your listfile.\n";
47         helpString += "NOTE: Make SURE the repfasta, repnames and contaxonomy are for the same label as the listfile.\n";
48         helpString += "The create.database command should be in the following format: \n";
49                 helpString += "create.database(repfasta=yourFastaFileFromGetOTURep, repname=yourNameFileFromGetOTURep, contaxonomy=yourConTaxFileFromClassifyOTU, list=yourListFile) \n";       
50                 helpString += "Example: create.database(repfasta=final.an.0.03.rep.fasta, repname=final.an.0.03.rep.names, list=final.an.list, label=0.03, contaxonomy=final.an.0.03.cons.taxonomy) \n";
51                 helpString += "Note: No spaces between parameter labels (i.e. repfasta), '=' and parameters (i.e.yourFastaFileFromGetOTURep).\n";       
52                 return helpString;
53         }
54         catch(exception& e) {
55                 m->errorOut(e, "CreateDatabaseCommand", "getHelpString");
56                 exit(1);
57         }
58 }
59 //**********************************************************************************************************************
60 string CreateDatabaseCommand::getOutputPattern(string type) {
61     try {
62         string pattern = "";
63         
64         if (type == "database") {  pattern = "[filename],database"; }
65         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
66         
67         return pattern;
68     }
69     catch(exception& e) {
70         m->errorOut(e, "CreateDatabaseCommand", "getOutputPattern");
71         exit(1);
72     }
73 }
74 //**********************************************************************************************************************
75 CreateDatabaseCommand::CreateDatabaseCommand(){ 
76         try {
77                 abort = true; calledHelp = true; 
78                 setParameters();
79                 vector<string> tempOutNames;
80                 outputTypes["database"] = tempOutNames;
81         }
82         catch(exception& e) {
83                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
84                 exit(1);
85         }
86 }
87
88 //**********************************************************************************************************************
89 CreateDatabaseCommand::CreateDatabaseCommand(string option)  {
90         try{
91                 abort = false; calledHelp = false;   
92         
93                 //allow user to run help
94                 if (option == "help") { 
95                         help(); abort = true; calledHelp = true;
96                 }else if(option == "citation") { citation(); abort = true; calledHelp = true;} 
97                 else {
98                         vector<string> myArray = setParameters();
99                         
100                         OptionParser parser(option);
101                         map<string, string> parameters = parser.getParameters();
102                         
103                         ValidParameters validParameter;
104                         map<string, string>::iterator it;
105             
106                         //check to make sure all parameters are valid for command
107                         for (it = parameters.begin(); it != parameters.end(); it++) { 
108                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
109                         }
110                         
111                         //initialize outputTypes
112                         vector<string> tempOutNames;
113                         outputTypes["database"] = tempOutNames;
114             
115                         //if the user changes the input directory command factory will send this info to us in the output parameter 
116                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
117                         if (inputDir == "not found"){   inputDir = "";          }
118                         else {
119                                 string path;
120                                 it = parameters.find("list");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
126                                 }
127                                 
128                                 it = parameters.find("repname");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["repname"] = inputDir + it->second;          }
134                                 }
135                                 
136                                 it = parameters.find("contaxonomy");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["contaxonomy"] = inputDir + it->second;              }
142                                 }
143                                 
144                                 it = parameters.find("repfasta");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["repfasta"] = inputDir + it->second;         }
150                                 }
151                                 
152                                 it = parameters.find("group");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
158                                 }
159                 
160                 it = parameters.find("count");
161                                 //user has given a template file
162                                 if(it != parameters.end()){
163                                         path = m->hasPath(it->second);
164                                         //if the user has not given a path then, add inputdir. else leave path alone.
165                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
166                                 }
167                 
168                 it = parameters.find("shared");
169                                 //user has given a template file
170                                 if(it != parameters.end()){ 
171                                         path = m->hasPath(it->second);
172                                         //if the user has not given a path then, add inputdir. else leave path alone.
173                                         if (path == "") {       parameters["shared"] = inputDir + it->second;           }
174                                 }
175                         }
176             
177                         
178                         //if the user changes the output directory command factory will send this info to us in the output parameter 
179                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = "";         }
180                         
181                         //check for required parameters
182                         listfile = validParameter.validFile(parameters, "list", true);
183                         if (listfile == "not found") {  listfile = "";                  }
184                         else if (listfile == "not open") { listfile = ""; abort = true; }       
185                         else { m->setListFile(listfile); }
186             
187             sharedfile = validParameter.validFile(parameters, "shared", true);
188                         if (sharedfile == "not found") {        sharedfile = "";                        }
189                         else if (sharedfile == "not open") { sharedfile = ""; abort = true; }   
190                         else { m->setSharedFile(sharedfile); }
191             
192             if ((sharedfile == "") && (listfile == "")) { 
193                                 //is there are current file available for either of these?
194                                 //give priority to list, then shared
195                                 listfile = m->getListFile(); 
196                                 if (listfile != "") {  m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
197                                 else { 
198                                         sharedfile = m->getSharedFile(); 
199                                         if (sharedfile != "") {  m->mothurOut("Using " + sharedfile + " as input file for the shared parameter."); m->mothurOutEndLine(); }
200                                         else { 
201                                                 m->mothurOut("No valid current files. You must provide a shared or list file before you can use the create.database command."); m->mothurOutEndLine(); 
202                                                 abort = true;
203                                         }
204                                 }
205                         }
206                         else if ((sharedfile != "") && (listfile != "")) { m->mothurOut("When executing a create.database command you must enter ONLY ONE of the following: shared or list."); m->mothurOutEndLine(); abort = true; }
207             
208             if (sharedfile != "") { if (outputDir == "") { outputDir = m->hasPath(sharedfile); } }
209             else { if (outputDir == "") { outputDir = m->hasPath(listfile); } }
210                         
211                         contaxonomyfile = validParameter.validFile(parameters, "contaxonomy", true);
212                         if (contaxonomyfile == "not found") {  //if there is a current list file, use it
213                contaxonomyfile = "";  m->mothurOut("The contaxonomy parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
214                         }
215                         else if (contaxonomyfile == "not open") { contaxonomyfile = ""; abort = true; }
216
217             repfastafile = validParameter.validFile(parameters, "repfasta", true);
218                         if (repfastafile == "not found") {  //if there is a current list file, use it
219                 repfastafile = "";  m->mothurOut("The repfasta parameter is required, aborting."); m->mothurOutEndLine(); abort = true; 
220                         }
221                         else if (repfastafile == "not open") { repfastafile = ""; abort = true; }
222
223             repnamesfile = validParameter.validFile(parameters, "repname", true);
224                         if (repnamesfile == "not found") {  repnamesfile = "";                          }
225                         else if (repnamesfile == "not open") { repnamesfile = ""; abort = true; }
226             
227             countfile = validParameter.validFile(parameters, "count", true);
228                         if (countfile == "not found") {  countfile = "";                        }
229                         else if (countfile == "not open") { countfile = ""; abort = true; }
230             
231             if ((countfile == "") && (repnamesfile == "")) {
232                 //if there is a current name file, use it, else look for current count file
233                 string repnamesfile = m->getNameFile();
234                                 if (repnamesfile != "") {  m->mothurOut("Using " + repnamesfile + " as input file for the repname parameter."); m->mothurOutEndLine(); }
235                                 else {
236                     countfile = m->getCountTableFile();
237                     if (countfile != "") {  m->mothurOut("Using " + countfile + " as input file for the count parameter."); m->mothurOutEndLine(); }
238                     else { m->mothurOut("[ERROR]: You must provide a count or repname file."); m->mothurOutEndLine(); abort = true;  }
239                 }
240             }
241
242                         groupfile = validParameter.validFile(parameters, "group", true);
243                         if (groupfile == "not open") { groupfile = ""; abort = true; }  
244                         else if (groupfile == "not found") { groupfile = ""; }
245                         else { m->setGroupFile(groupfile); }
246                         
247                         //check for optional parameter and set defaults
248                         // ...at some point should added some additional type checking...
249             label = validParameter.validFile(parameters, "label", false);                       
250                         if (label == "not found") { label = ""; m->mothurOut("You did not provide a label, I will use the first label in your listfile.\n");}
251         }
252         }
253         catch(exception& e) {
254                 m->errorOut(e, "CreateDatabaseCommand", "CreateDatabaseCommand");
255                 exit(1);
256         }
257 }
258 //**********************************************************************************************************************
259 int CreateDatabaseCommand::execute(){
260         try {
261                 
262                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
263         
264         //taxonomies holds the taxonomy info for each Otu
265         //classifyOtuSizes holds the size info of each Otu to help with error checking
266         vector<string> taxonomies;
267         vector<string> otuLabels;
268         vector<int> classifyOtuSizes = readTax(taxonomies, otuLabels);
269         
270         if (m->control_pressed) { return 0; }
271         
272         vector<Sequence> seqs;
273         vector<int> repOtusSizes = readFasta(seqs);
274         
275         if (m->control_pressed) { return 0; }
276         
277         //names redundants to uniques. backwards to how we normally do it, but each bin is the list file will be a key entry in the map.
278         map<string, string> repNames;
279         map<string, int> nameMap;
280         int numUniqueNamesFile = 0;
281         CountTable ct;
282         if (countfile == "") {
283             numUniqueNamesFile = m->readNames(repnamesfile, repNames, 1);
284             //the repnames file does not have the same order as the list file bins so we need to sort and reassemble for the search below
285             map<string, string> tempRepNames;
286             for (map<string, string>::iterator it = repNames.begin(); it != repNames.end();) {
287                 string bin = it->first;
288                 vector<string> temp;
289                 m->splitAtChar(bin, temp, ',');
290                 sort(temp.begin(), temp.end());
291                 bin = "";
292                 for (int i = 0; i < temp.size()-1; i++) {
293                     bin += temp[i] + ',';
294                 }
295                 bin += temp[temp.size()-1];
296                 tempRepNames[bin] = it->second;
297                 repNames.erase(it++);
298             }
299             repNames = tempRepNames;
300         }else {
301             ct.readTable(countfile, true, false);
302             numUniqueNamesFile = ct.getNumUniqueSeqs();
303             nameMap = ct.getNameMap();
304         }
305         
306         //are there the same number of otus in the fasta and name files
307         if (repOtusSizes.size() != numUniqueNamesFile) { m->mothurOut("[ERROR]: you have " + toString(numUniqueNamesFile) + " unique seqs in your repname file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
308         
309         if (m->control_pressed) { return 0; }
310         
311         //are there the same number of OTUs in the tax and fasta file
312         if (classifyOtuSizes.size() != repOtusSizes.size()) { m->mothurOut("[ERROR]: you have " + toString(classifyOtuSizes.size()) + " taxonomies in your contaxonomy file, but " + toString(repOtusSizes.size()) + " seqs in your repfasta file.  These should match.\n"); m->control_pressed = true; }
313
314         if (m->control_pressed) { return 0; }
315         
316         //at this point we have the same number of OTUs. Are the sizes we have found so far accurate?
317         for (int i = 0; i < classifyOtuSizes.size(); i++) {
318             if (classifyOtuSizes[i] != repOtusSizes[i]) {
319                m->mothurOut("[ERROR]: OTU size info does not match for bin " + toString(i+1) + ". The contaxonomy file indicated the OTU represented " + toString(classifyOtuSizes[i]) + " sequences, but the repfasta file had " + toString(repOtusSizes[i]) + ".  These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true; 
320             }
321         }
322         
323         if (m->control_pressed) { return 0; }
324         
325         
326         map<string, string> variables; 
327         if (listfile != "") {  variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(listfile)); }
328         else { variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(sharedfile)); }
329         string outputFileName = getOutputFileName("database", variables); 
330         outputNames.push_back(outputFileName); outputTypes["database"].push_back(outputFileName);
331         
332         ofstream out;
333         m->openOutputFile(outputFileName, out);
334         
335         string header = "OTUNumber\tAbundance\t";
336
337         
338         if (listfile != "") {
339             //at this point we are fairly sure the repfasta, repnames and contaxonomy files match so lets proceed with the listfile
340             ListVector* list = getList();
341             
342             if (otuLabels.size() != list->getNumBins()) { 
343                 m->mothurOut("[ERROR]: you have " + toString(otuLabels.size()) + " otus in your contaxonomy file, but your list file has " + toString(list->getNumBins()) + " otus. These should match. Make sure you are using files for the same distance.\n"); m->control_pressed = true;  }
344             
345             if (m->control_pressed) { delete list; return 0; }
346             
347             GroupMap* groupmap = NULL;
348             if (groupfile != "") {
349                 groupmap = new GroupMap(groupfile);
350                 groupmap->readMap();
351             }
352             
353             if (m->control_pressed) { delete list; if (groupfile != "") { delete groupmap; } return 0; }
354             
355             if (groupfile != "") { 
356                 header = "OTUNumber\t";
357                 for (int i = 0; i < groupmap->getNamesOfGroups().size(); i++) { header += (groupmap->getNamesOfGroups())[i] + '\t'; }
358             }else if (countfile != "") {
359                 if (ct.hasGroupInfo()) {
360                     header = "OTUNumber\t";
361                     for (int i = 0; i < ct.getNamesOfGroups().size(); i++) { header += (ct.getNamesOfGroups())[i] + '\t'; }
362                 }
363             }
364             header += "repSeqName\trepSeq\tOTUConTaxonomy";
365             out << header << endl;
366             
367             vector<string> binLabels = list->getLabels();
368             for (int i = 0; i < list->getNumBins(); i++) {
369                 
370                 int index = findIndex(otuLabels, binLabels[i]);
371                 if (index == -1) {  m->mothurOut("[ERROR]: " + binLabels[i] + " is not in your constaxonomy file, aborting.\n"); m->control_pressed = true; }
372                 
373                 if (m->control_pressed) { break; }
374                 
375                 out << otuLabels[index] << '\t';
376                 
377                 vector<string> binNames;
378                 string bin = list->get(i);
379                 m->splitAtComma(bin, binNames);
380                 
381                 string seqRepName = "";
382                 int numSeqsRep = 0;
383                 
384                 if (countfile == "") {
385                     sort(binNames.begin(), binNames.end());
386                     bin = "";
387                     for (int j = 0; j < binNames.size()-1; j++) {
388                         bin += binNames[j] + ',';
389                     }
390                     bin += binNames[binNames.size()-1];
391                     map<string, string>::iterator it = repNames.find(bin);
392                     
393                     if (it == repNames.end()) {
394                         m->mothurOut("[ERROR: OTU " + otuLabels[index] + " is not in the repnames file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
395                     }else { seqRepName = it->second;  numSeqsRep = binNames.size(); }
396                     
397                     //sanity check
398                     if (binNames.size() != classifyOtuSizes[index]) {
399                         m->mothurOut("[ERROR: OTU " + otuLabels[index] + " contains " + toString(binNames.size()) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
400                     }
401                 }else {
402                     //find rep sequence in bin
403                     for (int j = 0; j < binNames.size(); j++) {
404                         map<string, int>::iterator itNameMap = nameMap.find(binNames[j]); //if you are in the counttable you must be the rep. because get.oturep with a countfile only includes the rep sequences in the rep.count file.
405                         if (itNameMap != nameMap.end()) {
406                             seqRepName = itNameMap->first;
407                             numSeqsRep = itNameMap->second;
408                             j += binNames.size();
409                         }
410                     }
411                     
412                     if (seqRepName == "") {
413                         m->mothurOut("[ERROR: OTU " + otuLabels[index] + " is not in the count file. Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
414                     }
415                     
416                     if (numSeqsRep != classifyOtuSizes[i]) {
417                         m->mothurOut("[ERROR: OTU " + otuLabels[index] + " contains " + toString(numSeqsRep) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); m->control_pressed = true;   break;
418                     }
419                 }
420                 
421                 //output abundances
422                 if (groupfile != "") {
423                     string groupAbunds = "";
424                     map<string, int> counts;
425                     //initialize counts to 0
426                     for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { counts[(groupmap->getNamesOfGroups())[j]] = 0; }
427                     
428                     //find abundances by group
429                     bool error = false;
430                     for (int j = 0; j < binNames.size(); j++) {
431                         string group = groupmap->getGroup(binNames[j]);
432                         if (group == "not found") {
433                             m->mothurOut("[ERROR]: " + binNames[j] + " is not in your groupfile, please correct.\n");
434                             error = true;
435                         }else { counts[group]++; }
436                     }
437                     
438                     //output counts
439                     for (int j = 0; j < groupmap->getNamesOfGroups().size(); j++) { out << counts[(groupmap->getNamesOfGroups())[j]] << '\t';  }
440                     
441                     if (error) { m->control_pressed = true; }
442                 }else if (countfile != "") {
443                     if (ct.hasGroupInfo()) {
444                         vector<int> groupCounts = ct.getGroupCounts(seqRepName);
445                         for (int j = 0; j < groupCounts.size(); j++) { out << groupCounts[j] << '\t';  }
446                     }else { out << numSeqsRep << '\t'; }
447                 }else { out << numSeqsRep << '\t'; }
448                 
449                 //output repSeq
450                 out << seqRepName << '\t' << seqs[index].getAligned() << '\t' << taxonomies[index] << endl;
451             }
452             
453             
454             delete list;
455             if (groupfile != "") { delete groupmap; }
456            
457         }else {
458             vector<SharedRAbundVector*> lookup = getShared();
459             
460             header = "OTUNumber\t";
461             for (int i = 0; i < lookup.size(); i++) { header += lookup[i]->getGroup() + '\t'; }
462             header += "repSeqName\trepSeq\tOTUConTaxonomy";
463             out << header << endl;
464             
465             for (int h = 0; h < lookup[0]->getNumBins(); h++) {
466                 
467                 if (m->control_pressed) { break; }
468                 
469                 int index = findIndex(otuLabels, m->currentSharedBinLabels[h]);
470                 if (index == -1) {  m->mothurOut("[ERROR]: " + m->currentSharedBinLabels[h] + " is not in your constaxonomy file, aborting.\n"); m->control_pressed = true; }
471                 
472                 if (m->control_pressed) { break; }
473                 
474                 out << otuLabels[index] << '\t';
475                 
476                 int totalAbund = 0;
477                 for (int i = 0; i < lookup.size(); i++) { 
478                     int abund = lookup[i]->getAbundance(h);
479                     totalAbund += abund; 
480                     out << abund << '\t';
481                 }
482                 
483                 //sanity check
484                 if (totalAbund != classifyOtuSizes[index]) {
485                     m->mothurOut("[WARNING]: OTU " + m->currentSharedBinLabels[h] + " contains " + toString(totalAbund) + " sequence, but the rep and taxonomy files indicated this OTU should have " + toString(classifyOtuSizes[index]) + ". Make sure you are using files for the same distance.\n"); //m->control_pressed = true;   break;
486                 }
487                 
488                 //output repSeq
489                 out << seqs[index].getName() << '\t' << seqs[index].getAligned() << '\t' << taxonomies[index] << endl;
490             }
491         }
492         out.close();
493         if (m->control_pressed) { m->mothurRemove(outputFileName); return 0; }
494         
495         m->mothurOutEndLine();
496                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
497                 m->mothurOut(outputFileName); m->mothurOutEndLine();    
498                 m->mothurOutEndLine();
499         
500         return 0;
501         
502     }
503         catch(exception& e) {
504                 m->errorOut(e, "CreateDatabaseCommand", "execute");
505                 exit(1);
506         }
507 }
508 //**********************************************************************************************************************
509 int CreateDatabaseCommand::findIndex(vector<string>& otuLabels, string label){
510         try {
511         int index = -1;
512         for (int i = 0; i < otuLabels.size(); i++) {
513             if (m->isLabelEquivalent(otuLabels[i],label)) { index = i; break; }
514         }
515                 return index;
516     }
517         catch(exception& e) {
518                 m->errorOut(e, "CreateDatabaseCommand", "findIndex");
519                 exit(1);
520         }
521 }
522 //**********************************************************************************************************************
523 vector<int> CreateDatabaseCommand::readTax(vector<string>& taxonomies, vector<string>& otuLabels){
524         try {
525                 
526         vector<int> sizes; 
527         
528         ifstream in;
529         m->openInputFile(contaxonomyfile, in);
530         
531         //read headers
532         m->getline(in);
533         
534         while (!in.eof()) {
535             
536             if (m->control_pressed) { break; }
537             
538             string otu = ""; string tax = "unknown";
539             int size = 0;
540             
541             in >> otu >> size >> tax; m->gobble(in);
542             
543             sizes.push_back(size);
544             taxonomies.push_back(tax);
545             otuLabels.push_back(otu);
546         }
547         in.close();
548         
549         return sizes;
550     }
551         catch(exception& e) {
552                 m->errorOut(e, "CreateDatabaseCommand", "readTax");
553                 exit(1);
554         }
555 }
556 //**********************************************************************************************************************
557 vector<int> CreateDatabaseCommand::readFasta(vector<Sequence>& seqs){
558         try {
559                 
560         vector<int> sizes; 
561         
562         ifstream in;
563         m->openInputFile(repfastafile, in);
564         
565         set<int> sanity;
566         while (!in.eof()) {
567             
568             if (m->control_pressed) { break; }
569             
570             string binInfo;
571             Sequence seq(in, binInfo, true);  m->gobble(in);
572             
573             //the binInfo should look like - binNumber|size ie. 1|200 if it is binNumber|size|group then the user gave us the wrong repfasta file
574             vector<string> info;
575             m->splitAtChar(binInfo, info, '|');
576             //if (info.size() != 2) { m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;}
577             
578             int size = 0;
579             m->mothurConvert(info[1], size);
580             
581             int binNumber = 0;
582             string temp = "";
583             for (int i = 0; i < info[0].size(); i++) { if (isspace(info[0][i])) {;}else{temp +=info[0][i]; } }
584             m->mothurConvert(temp, binNumber);
585             set<int>::iterator it = sanity.find(binNumber);
586             if (it != sanity.end()) {
587                 m->mothurOut("[ERROR]: your repfasta file is not the right format.  The create database command is designed to be used with the output from get.oturep.  When running get.oturep you can not use a group file, because mothur is only expecting one representative sequence per OTU and when you use a group file with get.oturep a representative is found for each group.\n");  m->control_pressed = true; break;
588             }else { sanity.insert(binNumber); }
589             
590             sizes.push_back(size);
591             seqs.push_back(seq);
592         }
593         in.close();
594         
595         return sizes;
596     }
597         catch(exception& e) {
598                 m->errorOut(e, "CreateDatabaseCommand", "readFasta");
599                 exit(1);
600         }
601 }
602 //**********************************************************************************************************************
603 ListVector* CreateDatabaseCommand::getList(){
604         try {
605                 InputData* input = new InputData(listfile, "list");
606                 ListVector* list = input->getListVector();
607                 string lastLabel = list->getLabel();
608                 
609                 if (label == "") { label = lastLabel; delete input; return list; }
610                 
611                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
612                 set<string> labels; labels.insert(label);
613                 set<string> processedLabels;
614                 set<string> userLabels = labels;
615                 
616                 //as long as you are not at the end of the file or done wih the lines you want
617                 while((list != NULL) && (userLabels.size() != 0)) {
618                         if (m->control_pressed) {  delete input; return list;  }
619                         
620                         if(labels.count(list->getLabel()) == 1){
621                                 processedLabels.insert(list->getLabel());
622                                 userLabels.erase(list->getLabel());
623                                 break;
624                         }
625                         
626                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
627                                 string saveLabel = list->getLabel();
628                                 
629                                 delete list;
630                                 list = input->getListVector(lastLabel);
631                                 
632                                 processedLabels.insert(list->getLabel());
633                                 userLabels.erase(list->getLabel());
634                                 
635                                 //restore real lastlabel to save below
636                                 list->setLabel(saveLabel);
637                                 break;
638                         }
639                         
640                         lastLabel = list->getLabel();                   
641                         
642                         //get next line to process
643                         //prevent memory leak
644                         delete list; 
645                         list = input->getListVector();
646                 }
647                 
648                 
649                 if (m->control_pressed) { delete input; return list;  }
650                 
651                 //output error messages about any remaining user labels
652                 set<string>::iterator it;
653                 bool needToRun = false;
654                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
655                         m->mothurOut("Your file does not include the label " + *it); 
656                         if (processedLabels.count(lastLabel) != 1) {
657                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
658                                 needToRun = true;
659                         }else {
660                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
661                         }
662                 }
663                 
664                 //run last label if you need to
665                 if (needToRun == true)  {
666                         delete list;
667                         list = input->getListVector(lastLabel);
668                 }       
669                 
670                 delete input;
671
672         return list;
673     }
674         catch(exception& e) {
675                 m->errorOut(e, "CreateDatabaseCommand", "getList");
676                 exit(1);
677         }
678 }
679 //**********************************************************************************************************************
680 vector<SharedRAbundVector*> CreateDatabaseCommand::getShared(){
681         try {
682                 InputData input(sharedfile, "sharedfile");
683                 vector<SharedRAbundVector*> lookup = input.getSharedRAbundVectors();
684                 string lastLabel = lookup[0]->getLabel();
685                 
686                 if (label == "") { label = lastLabel; return lookup; }
687                 
688                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
689                 set<string> labels; labels.insert(label);
690                 set<string> processedLabels;
691                 set<string> userLabels = labels;
692                 
693                 //as long as you are not at the end of the file or done wih the lines you want
694                 while((lookup[0] != NULL) && (userLabels.size() != 0)) {
695                         if (m->control_pressed) {  return lookup;  }
696                         
697                         if(labels.count(lookup[0]->getLabel()) == 1){
698                                 processedLabels.insert(lookup[0]->getLabel());
699                                 userLabels.erase(lookup[0]->getLabel());
700                                 break;
701                         }
702                         
703                         if ((m->anyLabelsToProcess(lookup[0]->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
704                                 string saveLabel = lookup[0]->getLabel();
705                                 
706                                 for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
707                                 lookup = input.getSharedRAbundVectors(lastLabel);
708                                 
709                                 processedLabels.insert(lookup[0]->getLabel());
710                                 userLabels.erase(lookup[0]->getLabel());
711                                 
712                                 //restore real lastlabel to save below
713                                 lookup[0]->setLabel(saveLabel);
714                                 break;
715                         }
716                         
717                         lastLabel = lookup[0]->getLabel();                      
718                         
719                         //get next line to process
720                         //prevent memory leak
721                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
722                         lookup = input.getSharedRAbundVectors();
723                 }
724                 
725                 
726                 if (m->control_pressed) { return lookup;  }
727                 
728                 //output error messages about any remaining user labels
729                 set<string>::iterator it;
730                 bool needToRun = false;
731                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
732                         m->mothurOut("Your file does not include the label " + *it); 
733                         if (processedLabels.count(lastLabel) != 1) {
734                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
735                                 needToRun = true;
736                         }else {
737                                 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
738                         }
739                 }
740                 
741                 //run last label if you need to
742                 if (needToRun == true)  {
743                         for (int i = 0; i < lookup.size(); i++) {  delete lookup[i];  }
744                         lookup = input.getSharedRAbundVectors(lastLabel);
745                 }       
746         
747         return lookup;
748     }
749         catch(exception& e) {
750                 m->errorOut(e, "CreateDatabaseCommand", "getShared");
751                 exit(1);
752         }
753 }
754
755 //**********************************************************************************************************************
756
757