2 * removelineagecommand.cpp
5 * Created by westcott on 9/24/10.
6 * Copyright 2010 Schloss Lab. All rights reserved.
10 #include "removelineagecommand.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
14 //**********************************************************************************************************************
15 vector<string> RemoveLineageCommand::getValidParameters(){
17 string Array[] = {"fasta","name", "group", "alignreport", "taxon", "dups", "list","taxonomy","outputdir","inputdir"};
18 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
22 m->errorOut(e, "RemoveLineageCommand", "getValidParameters");
26 //**********************************************************************************************************************
27 RemoveLineageCommand::RemoveLineageCommand(){
29 //initialize outputTypes
30 vector<string> tempOutNames;
31 outputTypes["fasta"] = tempOutNames;
32 outputTypes["taxonomy"] = tempOutNames;
33 outputTypes["name"] = tempOutNames;
34 outputTypes["group"] = tempOutNames;
35 outputTypes["alignreport"] = tempOutNames;
36 outputTypes["list"] = tempOutNames;
39 m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand");
43 //**********************************************************************************************************************
44 vector<string> RemoveLineageCommand::getRequiredParameters(){
46 string Array[] = {"taxonomy"};
47 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
51 m->errorOut(e, "RemoveLineageCommand", "getRequiredParameters");
55 //**********************************************************************************************************************
56 vector<string> RemoveLineageCommand::getRequiredFiles(){
58 vector<string> myArray;
62 m->errorOut(e, "RemoveLineageCommand", "getRequiredFiles");
66 //**********************************************************************************************************************
67 RemoveLineageCommand::RemoveLineageCommand(string option) {
71 //allow user to run help
72 if(option == "help") { help(); abort = true; }
75 //valid paramters for this command
76 string Array[] = {"fasta","name", "group", "alignreport", "taxon", "dups", "list","taxonomy","outputdir","inputdir"};
77 vector<string> myArray (Array, Array+(sizeof(Array)/sizeof(string)));
79 OptionParser parser(option);
80 map<string,string> parameters = parser.getParameters();
82 ValidParameters validParameter;
83 map<string,string>::iterator it;
85 //check to make sure all parameters are valid for command
86 for (it = parameters.begin(); it != parameters.end(); it++) {
87 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
90 //initialize outputTypes
91 vector<string> tempOutNames;
92 outputTypes["fasta"] = tempOutNames;
93 outputTypes["taxonomy"] = tempOutNames;
94 outputTypes["name"] = tempOutNames;
95 outputTypes["group"] = tempOutNames;
96 outputTypes["alignreport"] = tempOutNames;
97 outputTypes["list"] = tempOutNames;
99 //if the user changes the output directory command factory will send this info to us in the output parameter
100 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
102 //if the user changes the input directory command factory will send this info to us in the output parameter
103 string inputDir = validParameter.validFile(parameters, "inputdir", false);
104 if (inputDir == "not found"){ inputDir = ""; }
107 it = parameters.find("alignreport");
108 //user has given a template file
109 if(it != parameters.end()){
110 path = m->hasPath(it->second);
111 //if the user has not given a path then, add inputdir. else leave path alone.
112 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
115 it = parameters.find("fasta");
116 //user has given a template file
117 if(it != parameters.end()){
118 path = m->hasPath(it->second);
119 //if the user has not given a path then, add inputdir. else leave path alone.
120 if (path == "") { parameters["fasta"] = inputDir + it->second; }
123 it = parameters.find("list");
124 //user has given a template file
125 if(it != parameters.end()){
126 path = m->hasPath(it->second);
127 //if the user has not given a path then, add inputdir. else leave path alone.
128 if (path == "") { parameters["list"] = inputDir + it->second; }
131 it = parameters.find("name");
132 //user has given a template file
133 if(it != parameters.end()){
134 path = m->hasPath(it->second);
135 //if the user has not given a path then, add inputdir. else leave path alone.
136 if (path == "") { parameters["name"] = inputDir + it->second; }
139 it = parameters.find("group");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["group"] = inputDir + it->second; }
147 it = parameters.find("taxonomy");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
157 //check for required parameters
158 fastafile = validParameter.validFile(parameters, "fasta", true);
159 if (fastafile == "not open") { abort = true; }
160 else if (fastafile == "not found") { fastafile = ""; }
162 namefile = validParameter.validFile(parameters, "name", true);
163 if (namefile == "not open") { abort = true; }
164 else if (namefile == "not found") { namefile = ""; }
166 groupfile = validParameter.validFile(parameters, "group", true);
167 if (groupfile == "not open") { abort = true; }
168 else if (groupfile == "not found") { groupfile = ""; }
170 alignfile = validParameter.validFile(parameters, "alignreport", true);
171 if (alignfile == "not open") { abort = true; }
172 else if (alignfile == "not found") { alignfile = ""; }
174 listfile = validParameter.validFile(parameters, "list", true);
175 if (listfile == "not open") { abort = true; }
176 else if (listfile == "not found") { listfile = ""; }
178 taxfile = validParameter.validFile(parameters, "taxonomy", true);
179 if (taxfile == "not open") { abort = true; }
180 else if (taxfile == "not found") { taxfile = ""; m->mothurOut("The taxonomy parameter is required for the get.lineage command."); m->mothurOutEndLine(); abort = true; }
182 string usedDups = "true";
183 string temp = validParameter.validFile(parameters, "dups", false); if (temp == "not found") { temp = "false"; usedDups = ""; }
184 dups = m->isTrue(temp);
186 taxons = validParameter.validFile(parameters, "taxon", false);
187 if (taxons == "not found") { taxons = ""; m->mothurOut("No taxons given, please correct."); m->mothurOutEndLine(); abort = true; }
190 if (taxons[0] == '\'') { taxons = taxons.substr(1); }
191 if (taxons[(taxons.length()-1)] == '\'') { taxons = taxons.substr(0, (taxons.length()-1)); }
195 if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "")) { m->mothurOut("You must provide one of the following: fasta, name, group, alignreport, taxonomy or listfile."); m->mothurOutEndLine(); abort = true; }
197 if ((usedDups != "") && (namefile == "")) { m->mothurOut("You may only use dups with the name option."); m->mothurOutEndLine(); abort = true; }
202 catch(exception& e) {
203 m->errorOut(e, "RemoveLineageCommand", "RemoveLineageCommand");
207 //**********************************************************************************************************************
209 void RemoveLineageCommand::help(){
211 m->mothurOut("The remove.lineage command reads a taxonomy file and any of the following file types: fasta, name, group, list or alignreport file.\n");
212 m->mothurOut("It outputs a file containing only the sequences from the taxonomy file that are not from the taxon you requested to be removed.\n");
213 m->mothurOut("The remove.lineage command parameters are taxon, fasta, name, group, list, taxonomy, alignreport and dups. You must provide taxonomy and taxon.\n");
214 m->mothurOut("The dups parameter allows you to add the entire line from a name file if you add any name from the line. default=false. \n");
215 m->mothurOut("The taxon parameter allows you to select the taxons you would like to remove.\n");
216 m->mothurOut("The remove.lineage command should be in the following format: remove.lineage(taxonomy=yourTaxonomyFile, taxon=yourTaxons).\n");
217 m->mothurOut("Example remove.lineage(taxonomy=amazon.silva.taxonomy, taxon=Bacteria;Firmicutes;Bacilli;Lactobacillales;).\n");
218 m->mothurOut("Note: If you are running mothur in script mode you must wrap the taxon in ' characters so mothur will ignore the ; in the taxon.\n");
219 m->mothurOut("Example remove.lineage(taxonomy=amazon.silva.taxonomy, taxon='Bacteria;Firmicutes;Bacilli;Lactobacillales;').\n");
220 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
222 catch(exception& e) {
223 m->errorOut(e, "RemoveLineageCommand", "help");
228 //**********************************************************************************************************************
230 int RemoveLineageCommand::execute(){
233 if (abort == true) { return 0; }
235 if (m->control_pressed) { return 0; }
237 //read through the correct file and output lines you want to keep
238 if (taxfile != "") { readTax(); } //fills the set of names to remove
239 if (namefile != "") { readName(); }
240 if (fastafile != "") { readFasta(); }
241 if (groupfile != "") { readGroup(); }
242 if (alignfile != "") { readAlign(); }
243 if (listfile != "") { readList(); }
246 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
248 if (outputNames.size() != 0) {
249 m->mothurOutEndLine();
250 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
251 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
252 m->mothurOutEndLine();
258 catch(exception& e) {
259 m->errorOut(e, "RemoveLineageCommand", "execute");
264 //**********************************************************************************************************************
265 int RemoveLineageCommand::readFasta(){
267 string thisOutputDir = outputDir;
268 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
269 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pick" + m->getExtension(fastafile);
272 m->openOutputFile(outputFileName, out);
275 m->openInputFile(fastafile, in);
278 bool wroteSomething = false;
281 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
283 Sequence currSeq(in);
284 name = currSeq.getName();
287 //if this name is in the accnos file
288 if (names.count(name) == 0) {
289 wroteSomething = true;
291 currSeq.printSequence(out);
299 if (wroteSomething == false) { m->mothurOut("Your fasta file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
300 outputNames.push_back(outputFileName); outputTypes["fasta"].push_back(outputFileName);
305 catch(exception& e) {
306 m->errorOut(e, "RemoveLineageCommand", "readFasta");
310 //**********************************************************************************************************************
311 int RemoveLineageCommand::readList(){
313 string thisOutputDir = outputDir;
314 if (outputDir == "") { thisOutputDir += m->hasPath(listfile); }
315 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "pick" + m->getExtension(listfile);
318 m->openOutputFile(outputFileName, out);
321 m->openInputFile(listfile, in);
323 bool wroteSomething = false;
326 //read in list vector
329 //make a new list vector
331 newList.setLabel(list.getLabel());
334 for (int i = 0; i < list.getNumBins(); i++) {
335 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
337 //parse out names that are in accnos file
338 string binnames = list.get(i);
340 string newNames = "";
341 while (binnames.find_first_of(',') != -1) {
342 string name = binnames.substr(0,binnames.find_first_of(','));
343 binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
345 //if that name is in the .accnos file, add it
346 if (names.count(name) == 0) { newNames += name + ","; }
350 if (names.count(binnames) == 0) { newNames += binnames + ","; }
352 //if there are names in this bin add to new list
353 if (newNames != "") {
354 newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
355 newList.push_back(newNames);
359 //print new listvector
360 if (newList.getNumBins() != 0) {
361 wroteSomething = true;
370 if (wroteSomething == false) { m->mothurOut("Your list file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
371 outputNames.push_back(outputFileName); outputTypes["list"].push_back(outputFileName);
376 catch(exception& e) {
377 m->errorOut(e, "RemoveLineageCommand", "readList");
381 //**********************************************************************************************************************
382 int RemoveLineageCommand::readName(){
384 string thisOutputDir = outputDir;
385 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
386 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pick" + m->getExtension(namefile);
389 m->openOutputFile(outputFileName, out);
392 m->openInputFile(namefile, in);
393 string name, firstCol, secondCol;
395 bool wroteSomething = false;
398 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
403 vector<string> parsedNames;
404 //parse second column saving each name
405 while (secondCol.find_first_of(',') != -1) {
406 name = secondCol.substr(0,secondCol.find_first_of(','));
407 secondCol = secondCol.substr(secondCol.find_first_of(',')+1, secondCol.length());
408 parsedNames.push_back(name);
411 //get name after last ,
412 parsedNames.push_back(secondCol);
414 vector<string> validSecond; validSecond.clear();
415 for (int i = 0; i < parsedNames.size(); i++) {
416 if (names.count(parsedNames[i]) == 0) {
417 validSecond.push_back(parsedNames[i]);
421 if ((dups) && (validSecond.size() != parsedNames.size())) { //if dups is true and we want to get rid of anyone, get rid of everyone
422 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
424 //if the name in the first column is in the set then print it and any other names in second column also in set
425 if (names.count(firstCol) == 0) {
427 wroteSomething = true;
429 out << firstCol << '\t';
431 //you know you have at least one valid second since first column is valid
432 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
433 out << validSecond[validSecond.size()-1] << endl;
435 //make first name in set you come to first column and then add the remaining names to second column
438 //you want part of this row
439 if (validSecond.size() != 0) {
441 wroteSomething = true;
443 out << validSecond[0] << '\t';
445 //you know you have at least one valid second since first column is valid
446 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
447 out << validSecond[validSecond.size()-1] << endl;
456 if (wroteSomething == false) { m->mothurOut("Your name file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
457 outputNames.push_back(outputFileName); outputTypes["name"].push_back(outputFileName);
461 catch(exception& e) {
462 m->errorOut(e, "RemoveLineageCommand", "readName");
467 //**********************************************************************************************************************
468 int RemoveLineageCommand::readGroup(){
470 string thisOutputDir = outputDir;
471 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
472 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pick" + m->getExtension(groupfile);
475 m->openOutputFile(outputFileName, out);
478 m->openInputFile(groupfile, in);
481 bool wroteSomething = false;
484 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
486 in >> name; //read from first column
487 in >> group; //read from second column
489 //if this name is in the accnos file
490 if (names.count(name) == 0) {
491 wroteSomething = true;
492 out << name << '\t' << group << endl;
500 if (wroteSomething == false) { m->mothurOut("Your group file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
501 outputNames.push_back(outputFileName); outputTypes["group"].push_back(outputFileName);
505 catch(exception& e) {
506 m->errorOut(e, "RemoveLineageCommand", "readGroup");
510 //**********************************************************************************************************************
511 int RemoveLineageCommand::readTax(){
513 string thisOutputDir = outputDir;
514 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
515 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile);
517 m->openOutputFile(outputFileName, out);
520 m->openInputFile(taxfile, in);
523 bool wroteSomething = false;
527 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
529 in >> name; //read from first column
530 in >> tax; //read from second column
534 //if the users file contains confidence scores we want to ignore them when searching for the taxons
535 int hasConfidences = tax.find_first_of('(');
536 if (hasConfidences != string::npos) {
537 newtax = removeConfidences(tax);
540 int pos = newtax.find(taxons);
542 if (pos == string::npos) {
543 wroteSomething = true;
544 out << name << '\t' << tax << endl;
545 }else{ //this sequence contains the taxon the user wants to remove
554 if (!wroteSomething) { m->mothurOut("Your taxonomy file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
555 outputNames.push_back(outputFileName); outputTypes["taxonomy"].push_back(outputFileName);
560 catch(exception& e) {
561 m->errorOut(e, "RemoveLineageCommand", "readTax");
565 /**************************************************************************************************/
566 string RemoveLineageCommand::removeConfidences(string tax) {
570 int taxLength = tax.length();
571 for(int i=0;i<taxLength;i++){
573 taxon = taxon.substr(0, taxon.find_first_of('(')); //rip off confidence
583 catch(exception& e) {
584 m->errorOut(e, "RemoveLineageCommand", "removeConfidences");
588 //**********************************************************************************************************************
589 //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
590 int RemoveLineageCommand::readAlign(){
592 string thisOutputDir = outputDir;
593 if (outputDir == "") { thisOutputDir += m->hasPath(alignfile); }
594 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(alignfile)) + "pick.align.report";
597 m->openOutputFile(outputFileName, out);
600 m->openInputFile(alignfile, in);
603 bool wroteSomething = false;
605 //read column headers
606 for (int i = 0; i < 16; i++) {
607 if (!in.eof()) { in >> junk; out << junk << '\t'; }
613 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
615 in >> name; //read from first column
617 //if this name is in the accnos file
618 if (names.count(name) == 0) {
619 wroteSomething = true;
624 for (int i = 0; i < 15; i++) {
625 if (!in.eof()) { in >> junk; out << junk << '\t'; }
630 }else {//still read just don't do anything with it
633 for (int i = 0; i < 15; i++) {
634 if (!in.eof()) { in >> junk; }
644 if (wroteSomething == false) { m->mothurOut("Your align file contains only sequences from " + taxons + "."); m->mothurOutEndLine(); }
645 outputNames.push_back(outputFileName); outputTypes["alignreport"].push_back(outputFileName);
650 catch(exception& e) {
651 m->errorOut(e, "RemoveLineageCommand", "readAlign");
655 //**********************************************************************************************************************