2 * binsequencecommand.cpp
5 * Created by Sarah Westcott on 4/3/09.
6 * Copyright 2009 Schloss Lab UMASS Amhers. All rights reserved.
10 #include "binsequencecommand.h"
12 //**********************************************************************************************************************
13 vector<string> BinSeqCommand::getValidParameters(){
15 string AlignArray[] = {"fasta","label","name", "group","outputdir","inputdir"};
16 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
20 m->errorOut(e, "BinSeqCommand", "getValidParameters");
24 //**********************************************************************************************************************
25 vector<string> BinSeqCommand::getRequiredParameters(){
27 string AlignArray[] = {"fasta"};
28 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
32 m->errorOut(e, "BinSeqCommand", "getRequiredParameters");
36 //**********************************************************************************************************************
37 vector<string> BinSeqCommand::getRequiredFiles(){
39 string AlignArray[] = {"list"};
40 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
44 m->errorOut(e, "BinSeqCommand", "getRequiredFiles");
48 //**********************************************************************************************************************
49 BinSeqCommand::BinSeqCommand(){
52 //initialize outputTypes
53 vector<string> tempOutNames;
54 outputTypes["fasta"] = tempOutNames;
57 m->errorOut(e, "BinSeqCommand", "BinSeqCommand");
61 //**********************************************************************************************************************
62 BinSeqCommand::BinSeqCommand(string option) {
64 globaldata = GlobalData::getInstance();
69 //allow user to run help
70 if(option == "help") { help(); abort = true; }
73 //valid paramters for this command
74 string AlignArray[] = {"fasta","label","name", "group","outputdir","inputdir"};
75 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
77 OptionParser parser(option);
78 map<string, string> parameters = parser.getParameters();
80 ValidParameters validParameter;
81 map<string, string>::iterator it;
83 //check to make sure all parameters are valid for command
84 for (it = parameters.begin(); it != parameters.end(); it++) {
85 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
88 //initialize outputTypes
89 vector<string> tempOutNames;
90 outputTypes["fasta"] = tempOutNames;
92 //if the user changes the output directory command factory will send this info to us in the output parameter
93 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
95 outputDir += m->hasPath(globaldata->getListFile()); //if user entered a file with a path then preserve it
99 //make sure the user has already run the read.otu command
100 if (globaldata->getListFile() == "") {
101 m->mothurOut("You must read a listfile before running the bin.seqs command.");
102 m->mothurOutEndLine();
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("fasta");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["fasta"] = inputDir + it->second; }
119 it = parameters.find("name");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["name"] = inputDir + it->second; }
127 it = parameters.find("group");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["group"] = inputDir + it->second; }
137 //check for required parameters
138 fastafile = validParameter.validFile(parameters, "fasta", true);
139 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the bin.seqs command."); m->mothurOutEndLine(); abort = true; }
140 else if (fastafile == "not open") { abort = true; }
142 //check for optional parameter and set defaults
143 // ...at some point should added some additional type checking...
145 label = validParameter.validFile(parameters, "label", false);
146 if (label == "not found") { label = ""; }
148 if(label != "all") { m->splitAtDash(label, labels); allLines = 0; }
149 else { allLines = 1; }
152 //if the user has not specified any labels use the ones from read.otu
154 allLines = globaldata->allLines;
155 labels = globaldata->labels;
158 namesfile = validParameter.validFile(parameters, "name", true);
159 if (namesfile == "not open") { abort = true; }
160 else if (namesfile == "not found") { namesfile = ""; }
162 groupfile = validParameter.validFile(parameters, "group", true);
163 if (groupfile == "not open") { abort = true; }
164 else if (groupfile == "not found") { groupfile = ""; }
166 if (abort == false) {
167 // m->openInputFile(fastafile, in);
168 fasta = new FastaMap();
169 if (groupfile != "") {
170 groupMap = new GroupMap(groupfile);
172 int error = groupMap->readMap();
173 if (error == 1) { delete groupMap; abort = true; }
179 catch(exception& e) {
180 m->errorOut(e, "BinSeqCommand", "BinSeqCommand");
184 //**********************************************************************************************************************
186 void BinSeqCommand::help(){
188 m->mothurOut("The bin.seqs command can only be executed after a successful read.otu command of a listfile.\n");
189 m->mothurOut("The bin.seqs command parameters are fasta, name, label and group. The fasta parameter is required.\n");
190 m->mothurOut("The label parameter allows you to select what distance levels you would like a output files created for, and are separated by dashes.\n");
191 m->mothurOut("The bin.seqs command should be in the following format: bin.seqs(fasta=yourFastaFile, name=yourNamesFile, group=yourGroupFile, label=yourLabels).\n");
192 m->mothurOut("Example bin.seqs(fasta=amazon.fasta, group=amazon.groups, name=amazon.names).\n");
193 m->mothurOut("The default value for label is all lines in your inputfile.\n");
194 m->mothurOut("The bin.seqs command outputs a .fasta file for each distance you specify appending the OTU number to each name.\n");
195 m->mothurOut("If you provide a groupfile, then it also appends the sequences group to the name.\n");
196 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFastaFile).\n\n");
198 catch(exception& e) {
199 m->errorOut(e, "BinSeqCommand", "help");
204 //**********************************************************************************************************************
206 BinSeqCommand::~BinSeqCommand(){
207 //made new in execute
208 if (abort == false) {
209 delete input; globaldata->ginput = NULL;
211 globaldata->gListVector = NULL;
213 if (groupfile != "") { delete groupMap; globaldata->gGroupmap = NULL; }
217 //**********************************************************************************************************************
219 int BinSeqCommand::execute(){
221 if (abort == true) { return 0; }
226 fasta->readFastaFile(fastafile);
229 //set format to list so input can get listvector
230 // globaldata->setFormat("list");
232 //if user gave a namesfile then use it
233 if (namesfile != "") {
238 read = new ReadOTUFile(globaldata->getListFile());
239 read->read(&*globaldata);
241 input = globaldata->ginput;
242 list = globaldata->gListVector;
243 string lastLabel = list->getLabel();
245 if (m->control_pressed) { return 0; }
247 //if the users enters label "0.06" and there is no "0.06" in their file use the next lowest label.
248 set<string> processedLabels;
249 set<string> userLabels = labels;
252 while((list != NULL) && ((allLines == 1) || (userLabels.size() != 0))) {
254 if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
256 if(allLines == 1 || labels.count(list->getLabel()) == 1){
258 error = process(list);
259 if (error == 1) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
261 processedLabels.insert(list->getLabel());
262 userLabels.erase(list->getLabel());
265 if ((m->anyLabelsToProcess(list->getLabel(), userLabels, "") == true) && (processedLabels.count(lastLabel) != 1)) {
266 string saveLabel = list->getLabel();
269 list = input->getListVector(lastLabel);
271 error = process(list);
272 if (error == 1) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
274 processedLabels.insert(list->getLabel());
275 userLabels.erase(list->getLabel());
277 //restore real lastlabel to save below
278 list->setLabel(saveLabel);
281 lastLabel = list->getLabel();
284 list = input->getListVector();
287 if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
289 //output error messages about any remaining user labels
290 set<string>::iterator it;
291 bool needToRun = false;
292 for (it = userLabels.begin(); it != userLabels.end(); it++) {
293 m->mothurOut("Your file does not include the label " + *it);
294 if (processedLabels.count(lastLabel) != 1) {
295 m->mothurOut(". I will use " + lastLabel + "."); m->mothurOutEndLine();
298 m->mothurOut(". Please refer to " + lastLabel + "."); m->mothurOutEndLine();
302 //run last label if you need to
303 if (needToRun == true) {
304 if (list != NULL) { delete list; }
305 list = input->getListVector(lastLabel);
307 error = process(list);
308 if (error == 1) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
313 if(m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
315 m->mothurOutEndLine();
316 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
317 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
318 m->mothurOutEndLine();
323 catch(exception& e) {
324 m->errorOut(e, "BinSeqCommand", "execute");
329 //**********************************************************************************************************************
330 void BinSeqCommand::readNamesFile() {
332 vector<string> dupNames;
333 m->openInputFile(namesfile, inNames);
335 string name, names, sequence;
338 inNames >> name; //read from first column A
339 inNames >> names; //read from second column A,B,C,D
343 //parse names into vector
344 m->splitAtComma(names, dupNames);
346 //store names in fasta map
347 sequence = fasta->getSequence(name);
348 for (int i = 0; i < dupNames.size(); i++) {
349 fasta->push_back(dupNames[i], sequence);
357 catch(exception& e) {
358 m->errorOut(e, "BinSeqCommand", "readNamesFile");
362 //**********************************************************************************************************************
363 //return 1 if error, 0 otherwise
364 int BinSeqCommand::process(ListVector* list) {
366 string binnames, name, sequence;
368 string outputFileName = outputDir + m->getRootName(m->getSimpleName(globaldata->getListFile())) + list->getLabel() + ".fasta";
369 m->openOutputFile(outputFileName, out);
371 //save to output list of output file names
372 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
374 m->mothurOut(list->getLabel()); m->mothurOutEndLine();
376 //for each bin in the list vector
377 for (int i = 0; i < list->size(); i++) {
379 if (m->control_pressed) { return 1; }
381 binnames = list->get(i);
382 while (binnames.find_first_of(',') != -1) {
383 name = binnames.substr(0,binnames.find_first_of(','));
384 binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
386 //do work for that name
387 sequence = fasta->getSequence(name);
388 if (sequence != "not found") {
389 //if you don't have groups
390 if (groupfile == "") {
391 name = name + "|" + toString(i+1);
392 out << ">" << name << endl;
393 out << sequence << endl;
394 }else {//if you do have groups
395 string group = groupMap->getGroup(name);
396 if (group == "not found") {
397 m->mothurOut(name + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
400 name = name + "|" + group + "|" + toString(i+1);
401 out << ">" << name << endl;
402 out << sequence << endl;
406 m->mothurOut(name + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
413 sequence = fasta->getSequence(binnames);
414 if (sequence != "not found") {
415 //if you don't have groups
416 if (groupfile == "") {
417 binnames = binnames + "|" + toString(i+1);
418 out << ">" << binnames << endl;
419 out << sequence << endl;
420 }else {//if you do have groups
421 string group = groupMap->getGroup(binnames);
422 if (group == "not found") {
423 m->mothurOut(binnames + " is missing from your group file. Please correct. "); m->mothurOutEndLine();
426 binnames = binnames + "|" + group + "|" + toString(i+1);
427 out << ">" << binnames << endl;
428 out << sequence << endl;
432 m->mothurOut(binnames + " is missing from your fasta or name file. Please correct. "); m->mothurOutEndLine();
441 catch(exception& e) {
442 m->errorOut(e, "BinSeqCommand", "process");
446 //**********************************************************************************************************************