]> git.donarmstrong.com Git - mothur.git/blob - binsequencecommand.cpp
Merge remote-tracking branch 'mothur/master'
[mothur.git] / binsequencecommand.cpp
1 /*
2  *  binsequencecommand.cpp
3  *  Mothur
4  *
5  *  Created by Sarah Westcott on 4/3/09.
6  *  Copyright 2009 Schloss Lab UMASS Amhers. All rights reserved.
7  *
8  */
9
10 #include "binsequencecommand.h"
11
12
13 //**********************************************************************************************************************
14 vector<string> BinSeqCommand::setParameters(){  
15         try {
16                 CommandParameter plist("list", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(plist);
17                 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
18         CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
19         CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
20                 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
21                 CommandParameter plabel("label", "String", "", "", "", "", "",false,false); parameters.push_back(plabel);
22                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
23                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
24         
25                 vector<string> myArray;
26                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
27                 return myArray;
28         }
29         catch(exception& e) {
30                 m->errorOut(e, "BinSeqCommand", "setParameters");
31                 exit(1);
32         }
33 }
34 //**********************************************************************************************************************
35 string BinSeqCommand::getHelpString(){  
36         try {
37                 string helpString = "";
38                 helpString += "The bin.seqs command parameters are list, fasta, name, count, label and group.  The fasta and list are required, unless you have a valid current list and fasta file.\n";
39                 helpString += "The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n";
40                 helpString += "The bin.seqs command should be in the following format: bin.seqs(fasta=yourFastaFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n";
41                 helpString += "Example bin.seqs(fasta=amazon.fasta, group=amazon.groups, name=amazon.names).\n";
42                 helpString += "The default value for label is all lines in your inputfile.\n";
43                 helpString += "The bin.seqs command outputs a .fasta file for each distance you specify appending the OTU number to each name.\n";
44                 helpString += "If you provide a groupfile, then it also appends the sequences group to the name.\n";
45                 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "BinSeqCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string BinSeqCommand::getOutputFileNameTag(string type, string inputName=""){   
55         try {
56         string outputFileName = "";
57                 map<string, vector<string> >::iterator it;
58         
59         //is this a type this command creates
60         it = outputTypes.find(type);
61         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
62         else {
63             if (type == "fasta") {  outputFileName =  "fasta"; }
64             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
65         }
66         return outputFileName;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "BinSeqCommand", "getOutputFileNameTag");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74 BinSeqCommand::BinSeqCommand(){ 
75         try {
76                 abort = true; calledHelp = true; 
77                 setParameters();
78                 vector<string> tempOutNames;
79                 outputTypes["fasta"] = tempOutNames;
80         }
81         catch(exception& e) {
82                 m->errorOut(e, "BinSeqCommand", "BinSeqCommand");
83                 exit(1);
84         }
85 }
86 //**********************************************************************************************************************
87 BinSeqCommand::BinSeqCommand(string option) {
88         try {
89                 abort = false; calledHelp = false;   
90                 allLines = 1;
91                 labels.clear();
92                 
93                 //allow user to run help
94                 if(option == "help") { help(); abort = true; calledHelp = true; }
95                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
96                 
97                 else {
98                         vector<string> myArray = setParameters();
99                         
100                         OptionParser parser(option);
101                         map<string, string> parameters = parser.getParameters();
102                         
103                         ValidParameters validParameter;
104                         map<string, string>::iterator it;
105                 
106                         //check to make sure all parameters are valid for command
107                         for (it = parameters.begin(); it != parameters.end(); it++) { 
108                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
109                         }
110                         
111                         //initialize outputTypes
112                         vector<string> tempOutNames;
113                         outputTypes["fasta"] = tempOutNames;
114                         
115                         //if the user changes the input directory command factory will send this info to us in the output parameter 
116                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
117                         if (inputDir == "not found"){   inputDir = "";          }
118                         else {
119                                 string path;
120                                 it = parameters.find("fasta");
121                                 //user has given a template file
122                                 if(it != parameters.end()){ 
123                                         path = m->hasPath(it->second);
124                                         //if the user has not given a path then, add inputdir. else leave path alone.
125                                         if (path == "") {       parameters["fasta"] = inputDir + it->second;            }
126                                 }
127                                 
128                                 it = parameters.find("list");
129                                 //user has given a template file
130                                 if(it != parameters.end()){ 
131                                         path = m->hasPath(it->second);
132                                         //if the user has not given a path then, add inputdir. else leave path alone.
133                                         if (path == "") {       parameters["list"] = inputDir + it->second;             }
134                                 }
135                                 
136                                 it = parameters.find("name");
137                                 //user has given a template file
138                                 if(it != parameters.end()){ 
139                                         path = m->hasPath(it->second);
140                                         //if the user has not given a path then, add inputdir. else leave path alone.
141                                         if (path == "") {       parameters["name"] = inputDir + it->second;             }
142                                 }
143                                 
144                                 it = parameters.find("group");
145                                 //user has given a template file
146                                 if(it != parameters.end()){ 
147                                         path = m->hasPath(it->second);
148                                         //if the user has not given a path then, add inputdir. else leave path alone.
149                                         if (path == "") {       parameters["group"] = inputDir + it->second;            }
150                                 }
151                 
152                 it = parameters.find("count");
153                                 //user has given a template file
154                                 if(it != parameters.end()){ 
155                                         path = m->hasPath(it->second);
156                                         //if the user has not given a path then, add inputdir. else leave path alone.
157                                         if (path == "") {       parameters["count"] = inputDir + it->second;            }
158                                 }
159                         }
160
161                         
162                         //check for required parameters
163                         fastafile = validParameter.validFile(parameters, "fasta", true);
164                         if (fastafile == "not found") {                                 //if there is a current phylip file, use it
165                                 fastafile = m->getFastaFile(); 
166                                 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
167                                 else {  m->mothurOut("You have no current fasta file and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
168                         }
169                         else if (fastafile == "not open") { abort = true; }     
170                         else { m->setFastaFile(fastafile); }
171                         
172                         listfile = validParameter.validFile(parameters, "list", true);
173                         if (listfile == "not found") {                  
174                                 listfile = m->getListFile(); 
175                                 if (listfile != "") { m->mothurOut("Using " + listfile + " as input file for the list parameter."); m->mothurOutEndLine(); }
176                                 else {  m->mothurOut("You have no current list file and the list parameter is required."); m->mothurOutEndLine(); abort = true; }
177                         }
178                         else if (listfile == "not open") { listfile = ""; abort = true; }       
179                         else { m->setListFile(listfile); }
180                         
181                         //if the user changes the output directory command factory will send this info to us in the output parameter 
182                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
183                                 outputDir = ""; 
184                                 outputDir += m->hasPath(listfile); //if user entered a file with a path then preserve it        
185                         }
186                         
187                 
188                         //check for optional parameter and set defaults
189                         // ...at some point should added some additional type checking...
190                         
191                         label = validParameter.validFile(parameters, "label", false);                   
192                         if (label == "not found") { label = ""; }
193                         else { 
194                                 if(label != "all") {  m->splitAtDash(label, labels);  allLines = 0;  }
195                                 else { allLines = 1;  }
196                         }
197                         
198                         namesfile = validParameter.validFile(parameters, "name", true);
199                         if (namesfile == "not open") { namesfile = ""; abort = true; }  
200                         else if (namesfile == "not found") { namesfile = ""; }
201                         else {  m->setNameFile(namesfile); }
202
203                         groupfile = validParameter.validFile(parameters, "group", true);
204                         if (groupfile == "not open") { abort = true; }
205                         else if (groupfile == "not found") { groupfile = ""; }
206                         else { m->setGroupFile(groupfile); }
207             
208             countfile = validParameter.validFile(parameters, "count", true);
209                         if (countfile == "not open") { countfile = ""; abort = true; }
210                         else if (countfile == "not found") { countfile = "";  } 
211                         else { m->setCountTableFile(countfile); }
212             
213             if ((namesfile != "") && (countfile != "")) {
214                 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
215             }
216                         
217             if ((groupfile != "") && (countfile != "")) {
218                 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
219             }
220                         
221             if (countfile == "") {
222                 if (namesfile == ""){
223                     vector<string> files; files.push_back(fastafile); 
224                     parser.getNameFile(files);
225                 }
226             }
227                         
228                 }
229         }
230         catch(exception& e) {
231                 m->errorOut(e, "BinSeqCommand", "BinSeqCommand");
232                 exit(1);
233         }
234 }
235 //**********************************************************************************************************************
236
237 BinSeqCommand::~BinSeqCommand(){}
238 //**********************************************************************************************************************
239
240 int BinSeqCommand::execute(){
241         try {
242                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
243         
244                 int error = 0;
245                 
246                 fasta = new FastaMap();
247                 if (groupfile != "") {
248                         groupMap = new GroupMap(groupfile);
249                         groupMap->readMap();
250                 }
251                 
252                 //read fastafile
253                 fasta->readFastaFile(fastafile);
254                 
255                 //if user gave a namesfile then use it
256                 if (namesfile != "") {  readNamesFile();  }
257         if (countfile != "") {  ct.readTable(countfile);  }
258                 
259                 input = new InputData(listfile, "list");
260                 list = input->getListVector();
261                 string lastLabel = list->getLabel();
262                 
263                 if (m->control_pressed) {  delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; }
264                 
265                 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
266                 set<string> processedLabels;
267                 set<string> userLabels = labels;
268
269                                 
270                 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
271                         
272                         if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);                } delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; } 
273                         
274                         if(allLines == 1 || labels.count(list->getLabel()) == 1){
275                                 
276                                 error = process(list);  
277                                 if (error == 1) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);                } delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; } 
278                                                         
279                                 processedLabels.insert(list->getLabel());
280                                 userLabels.erase(list->getLabel());
281                         }
282                         
283                         if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
284                                 string saveLabel = list->getLabel();
285                                 
286                                 delete list;
287                                 list = input->getListVector(lastLabel);
288                                 
289                                 error = process(list);  
290                                 if (error == 1) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);                } delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; }
291                                                                                                         
292                                 processedLabels.insert(list->getLabel());
293                                 userLabels.erase(list->getLabel());
294                                 
295                                 //restore real lastlabel to save below
296                                 list->setLabel(saveLabel);
297                         }
298                         
299                         lastLabel = list->getLabel();                   
300                         
301                         delete list;
302                         list = input->getListVector();
303                 }
304                 
305                 if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);                } delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; } 
306
307                 //output error messages about any remaining user labels
308                 set<string>::iterator it;
309                 bool needToRun = false;
310                 for (it = userLabels.begin(); it != userLabels.end(); it++) {  
311                         m->mothurOut("Your file does not include the label " + *it); 
312                         if (processedLabels.count(lastLabel) != 1) {
313                                 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
314                                 needToRun = true;
315                         }else {
316                                 m->mothurOut(". Please refer to " + lastLabel + ".");  m->mothurOutEndLine();
317                         }
318                 }
319                 
320                 //run last label if you need to
321                 if (needToRun == true)  {
322                         if (list != NULL) {             delete list;    }
323                         list = input->getListVector(lastLabel);
324                                 
325                         error = process(list);  
326                         if (error == 1) { for (int i = 0; i < outputNames.size(); i++) {        m->mothurRemove(outputNames[i]);                } delete input;  delete fasta; if (groupfile != "") {  delete groupMap;   } return 0; }
327                         
328                         delete list;  
329                 }
330                 
331                 delete input;  
332                 delete fasta; 
333                 if (groupfile != "") {  delete groupMap;   } 
334                 
335                 if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]);                }  return 0; }  
336                 
337                 m->mothurOutEndLine();
338                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
339                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
340                 m->mothurOutEndLine();
341
342                 
343                 return 0;
344         }
345         catch(exception& e) {
346                 m->errorOut(e, "BinSeqCommand", "execute");
347                 exit(1);
348         }
349 }
350
351 //**********************************************************************************************************************
352 void BinSeqCommand::readNamesFile() {
353         try {
354                 vector<string> dupNames;
355                 m->openInputFile(namesfile, inNames);
356                 
357                 string name, names, sequence;
358         
359                 while(inNames){
360                         inNames >> name;                        //read from first column  A
361                         inNames >> names;               //read from second column  A,B,C,D
362                         
363                         dupNames.clear();
364                         
365                         //parse names into vector
366                         m->splitAtComma(names, dupNames);
367                         
368                         //store names in fasta map
369                         sequence = fasta->getSequence(name);
370                         for (int i = 0; i < dupNames.size(); i++) {
371                                 fasta->push_back(dupNames[i], sequence);
372                         }
373                 
374                         m->gobble(inNames);
375                 }
376                 inNames.close();
377
378         }
379         catch(exception& e) {
380                 m->errorOut(e, "BinSeqCommand", "readNamesFile");
381                 exit(1);
382         }
383 }
384 //**********************************************************************************************************************
385 //return 1 if error, 0 otherwise
386 int BinSeqCommand::process(ListVector* list) {
387         try {
388         string outputFileName = outputDir + m->getRootName(m->getSimpleName(listfile)) + list->getLabel() + "." + getOutputFileNameTag("fasta");
389         m->openOutputFile(outputFileName, out);
390         outputNames.push_back(outputFileName);  outputTypes["fasta"].push_back(outputFileName);
391         
392         m->mothurOut(list->getLabel()); m->mothurOutEndLine();
393         
394         //for each bin in the list vector
395         for (int i = 0; i < list->size(); i++) {
396             
397             if (m->control_pressed) {  return 1; }
398             
399             string binnames = list->get(i);
400             vector<string> names;
401             m->splitAtComma(binnames, names);
402             for (int j = 0; j < names.size(); j++) {
403                 string name = names[j];
404                 
405                 //do work for that name
406                 string sequence = fasta->getSequence(name);
407                 
408                 if (countfile != "") {
409                     if (sequence != "not found") {
410                         if (ct.hasGroupInfo()) {
411                             vector<string> groups = ct.getGroups(name);
412                             string groupInfo = "";
413                             for (int k = 0; k < groups.size()-1; k++) {
414                                 groupInfo += groups[k] + "-";
415                             }
416                             if (groups.size() != 0) { groupInfo += groups[groups.size()-1]; }
417                             else { groupInfo = "not found";  }
418                             name = name + "\t" + groupInfo + "\t" + toString(i+1)+ "\tNumRep=" + toString(ct.getNumSeqs(name));
419                             out << ">" << name << endl;
420                             out << sequence << endl;
421                         }else {
422                             name = name + "\t" + toString(i+1) + "\tNumRep=" + toString(ct.getNumSeqs(name));
423                             out << ">" << name << endl;
424                             out << sequence << endl;
425                         }
426                         
427                     }else { m->mothurOut(name + " is missing from your fasta. Does your list file contain all sequence names or just the uniques?"); m->mothurOutEndLine(); return 1; }
428                 }else {
429                     if (sequence != "not found") {
430                         //if you don't have groups
431                         if (groupfile == "") {
432                             name = name + "\t" + toString(i+1);
433                             out << ">" << name << endl;
434                             out << sequence << endl;
435                         }else {//if you do have groups
436                             string group = groupMap->getGroup(name);
437                             if (group == "not found") {  
438                                 m->mothurOut(name + " is missing from your group file. Please correct. ");  m->mothurOutEndLine();
439                                 return 1;
440                             }else{
441                                 name = name + "\t" + group + "\t" + toString(i+1);
442                                 out << ">" << name << endl;
443                                 out << sequence << endl;
444                             }
445                         }
446                     }else { m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine(); return 1; }
447                 }
448             }
449         }
450         
451         out.close();
452         return 0;
453
454         }
455         catch(exception& e) {
456                 m->errorOut(e, "BinSeqCommand", "process");
457                 exit(1);
458         }
459 }
460 //**********************************************************************************************************************
461
462