2 * removeseqscommand.cpp
5 * Created by Sarah Westcott on 7/8/09.
6 * Copyright 2009 Schloss Lab UMASS Amherst. All rights reserved.
10 #include "removeseqscommand.h"
11 #include "sequence.hpp"
12 #include "listvector.hpp"
14 //**********************************************************************************************************************
15 vector<string> RemoveSeqsCommand::setParameters(){
17 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pfasta);
18 CommandParameter pname("name", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pname);
19 CommandParameter pgroup("group", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pgroup);
20 CommandParameter plist("list", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(plist);
21 CommandParameter ptaxonomy("taxonomy", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(ptaxonomy);
22 CommandParameter palignreport("alignreport", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(palignreport);
23 CommandParameter pqfile("qfile", "InputTypes", "", "", "none", "FNGLT", "none",false,false); parameters.push_back(pqfile);
24 CommandParameter paccnos("accnos", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(paccnos);
25 CommandParameter pdups("dups", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pdups);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "RemoveSeqsCommand", "setParameters");
38 //**********************************************************************************************************************
39 string RemoveSeqsCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The remove.seqs command reads an .accnos file and at least one of the following file types: fasta, name, group, list, taxonomy, quality or alignreport file.\n";
43 helpString += "It outputs a file containing the sequences NOT in the .accnos file.\n";
44 helpString += "The remove.seqs command parameters are accnos, fasta, name, group, list, taxonomy, qfile, alignreport and dups. You must provide accnos and at least one of the file parameters.\n";
45 helpString += "The dups parameter allows you to remove the entire line from a name file if you remove any name from the line. default=true. \n";
46 helpString += "The remove.seqs command should be in the following format: remove.seqs(accnos=yourAccnos, fasta=yourFasta).\n";
47 helpString += "Example remove.seqs(accnos=amazon.accnos, fasta=amazon.fasta).\n";
48 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52 m->errorOut(e, "RemoveSeqsCommand", "getHelpString");
58 //**********************************************************************************************************************
59 RemoveSeqsCommand::RemoveSeqsCommand(){
61 abort = true; calledHelp = true;
63 vector<string> tempOutNames;
64 outputTypes["fasta"] = tempOutNames;
65 outputTypes["taxonomy"] = tempOutNames;
66 outputTypes["name"] = tempOutNames;
67 outputTypes["group"] = tempOutNames;
68 outputTypes["alignreport"] = tempOutNames;
69 outputTypes["list"] = tempOutNames;
70 outputTypes["qfile"] = tempOutNames;
73 m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand");
77 //**********************************************************************************************************************
78 RemoveSeqsCommand::RemoveSeqsCommand(string option) {
80 abort = false; calledHelp = false;
82 //allow user to run help
83 if(option == "help") { help(); abort = true; calledHelp = true; }
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
91 ValidParameters validParameter;
92 map<string,string>::iterator it;
94 //check to make sure all parameters are valid for command
95 for (it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["fasta"] = tempOutNames;
102 outputTypes["taxonomy"] = tempOutNames;
103 outputTypes["name"] = tempOutNames;
104 outputTypes["group"] = tempOutNames;
105 outputTypes["alignreport"] = tempOutNames;
106 outputTypes["list"] = tempOutNames;
107 outputTypes["qfile"] = tempOutNames;
109 //if the user changes the output directory command factory will send this info to us in the output parameter
110 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
112 //if the user changes the input directory command factory will send this info to us in the output parameter
113 string inputDir = validParameter.validFile(parameters, "inputdir", false);
114 if (inputDir == "not found"){ inputDir = ""; }
117 it = parameters.find("alignreport");
118 //user has given a template file
119 if(it != parameters.end()){
120 path = m->hasPath(it->second);
121 //if the user has not given a path then, add inputdir. else leave path alone.
122 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
125 it = parameters.find("fasta");
126 //user has given a template file
127 if(it != parameters.end()){
128 path = m->hasPath(it->second);
129 //if the user has not given a path then, add inputdir. else leave path alone.
130 if (path == "") { parameters["fasta"] = inputDir + it->second; }
133 it = parameters.find("accnos");
134 //user has given a template file
135 if(it != parameters.end()){
136 path = m->hasPath(it->second);
137 //if the user has not given a path then, add inputdir. else leave path alone.
138 if (path == "") { parameters["accnos"] = inputDir + it->second; }
141 it = parameters.find("list");
142 //user has given a template file
143 if(it != parameters.end()){
144 path = m->hasPath(it->second);
145 //if the user has not given a path then, add inputdir. else leave path alone.
146 if (path == "") { parameters["list"] = inputDir + it->second; }
149 it = parameters.find("name");
150 //user has given a template file
151 if(it != parameters.end()){
152 path = m->hasPath(it->second);
153 //if the user has not given a path then, add inputdir. else leave path alone.
154 if (path == "") { parameters["name"] = inputDir + it->second; }
157 it = parameters.find("group");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["group"] = inputDir + it->second; }
165 it = parameters.find("taxonomy");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
173 it = parameters.find("qfile");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["qfile"] = inputDir + it->second; }
183 //check for required parameters
184 accnosfile = validParameter.validFile(parameters, "accnos", true);
185 if (accnosfile == "not open") { abort = true; }
186 else if (accnosfile == "not found") {
187 accnosfile = m->getAccnosFile();
188 if (accnosfile != "") { m->mothurOut("Using " + accnosfile + " as input file for the accnos parameter."); m->mothurOutEndLine(); }
190 m->mothurOut("You have no valid accnos file and accnos is required."); m->mothurOutEndLine();
195 fastafile = validParameter.validFile(parameters, "fasta", true);
196 if (fastafile == "not open") { abort = true; }
197 else if (fastafile == "not found") { fastafile = ""; }
199 namefile = validParameter.validFile(parameters, "name", true);
200 if (namefile == "not open") { abort = true; }
201 else if (namefile == "not found") { namefile = ""; }
203 groupfile = validParameter.validFile(parameters, "group", true);
204 if (groupfile == "not open") { abort = true; }
205 else if (groupfile == "not found") { groupfile = ""; }
207 alignfile = validParameter.validFile(parameters, "alignreport", true);
208 if (alignfile == "not open") { abort = true; }
209 else if (alignfile == "not found") { alignfile = ""; }
211 listfile = validParameter.validFile(parameters, "list", true);
212 if (listfile == "not open") { abort = true; }
213 else if (listfile == "not found") { listfile = ""; }
215 taxfile = validParameter.validFile(parameters, "taxonomy", true);
216 if (taxfile == "not open") { abort = true; }
217 else if (taxfile == "not found") { taxfile = ""; }
219 qualfile = validParameter.validFile(parameters, "qfile", true);
220 if (qualfile == "not open") { abort = true; }
221 else if (qualfile == "not found") { qualfile = ""; }
224 string usedDups = "true";
225 string temp = validParameter.validFile(parameters, "dups", false);
226 if (temp == "not found") {
227 if (namefile != "") { temp = "true"; }
228 else { temp = "false"; usedDups = ""; }
230 dups = m->isTrue(temp);
232 if ((fastafile == "") && (namefile == "") && (groupfile == "") && (alignfile == "") && (listfile == "") && (taxfile == "") && (qualfile == "")) { m->mothurOut("You must provide at least one of the following: fasta, name, group, taxonomy, quality, alignreport or list."); m->mothurOutEndLine(); abort = true; }
237 catch(exception& e) {
238 m->errorOut(e, "RemoveSeqsCommand", "RemoveSeqsCommand");
242 //**********************************************************************************************************************
244 int RemoveSeqsCommand::execute(){
247 if (abort == true) { if (calledHelp) { return 0; } return 2; }
249 //get names you want to keep
252 if (m->control_pressed) { return 0; }
254 //read through the correct file and output lines you want to keep
255 if (namefile != "") { readName(); }
256 if (fastafile != "") { readFasta(); }
257 if (groupfile != "") { readGroup(); }
258 if (alignfile != "") { readAlign(); }
259 if (listfile != "") { readList(); }
260 if (taxfile != "") { readTax(); }
261 if (qualfile != "") { readQual(); }
263 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { remove(outputNames[i].c_str()); } return 0; }
265 m->mothurOut("Removed " + toString(names.size()) + " sequences."); m->mothurOutEndLine();
267 if (outputNames.size() != 0) {
268 m->mothurOutEndLine();
269 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
270 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
271 m->mothurOutEndLine();
273 //set fasta file as new current fastafile
275 itTypes = outputTypes.find("fasta");
276 if (itTypes != outputTypes.end()) {
277 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
280 itTypes = outputTypes.find("name");
281 if (itTypes != outputTypes.end()) {
282 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
285 itTypes = outputTypes.find("group");
286 if (itTypes != outputTypes.end()) {
287 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
290 itTypes = outputTypes.find("list");
291 if (itTypes != outputTypes.end()) {
292 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setListFile(current); }
295 itTypes = outputTypes.find("taxonomy");
296 if (itTypes != outputTypes.end()) {
297 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
300 itTypes = outputTypes.find("qfile");
301 if (itTypes != outputTypes.end()) {
302 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setQualFile(current); }
309 catch(exception& e) {
310 m->errorOut(e, "RemoveSeqsCommand", "execute");
315 //**********************************************************************************************************************
316 int RemoveSeqsCommand::readFasta(){
318 string thisOutputDir = outputDir;
319 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
320 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pick" + m->getExtension(fastafile);
323 m->openOutputFile(outputFileName, out);
326 m->openInputFile(fastafile, in);
329 bool wroteSomething = false;
332 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
334 Sequence currSeq(in);
335 name = currSeq.getName();
338 //if this name is in the accnos file
339 if (names.count(name) == 0) {
340 wroteSomething = true;
342 currSeq.printSequence(out);
350 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
351 outputTypes["fasta"].push_back(outputFileName); outputNames.push_back(outputFileName);
356 catch(exception& e) {
357 m->errorOut(e, "RemoveSeqsCommand", "readFasta");
361 //**********************************************************************************************************************
362 int RemoveSeqsCommand::readQual(){
364 string thisOutputDir = outputDir;
365 if (outputDir == "") { thisOutputDir += m->hasPath(qualfile); }
366 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(qualfile)) + "pick" + m->getExtension(qualfile);
368 m->openOutputFile(outputFileName, out);
372 m->openInputFile(qualfile, in);
375 bool wroteSomething = false;
379 string saveName = "";
385 if (name.length() != 0) {
386 saveName = name.substr(1);
389 if (c == 10 || c == 13){ break; }
396 char letter= in.get();
397 if(letter == '>'){ in.putback(letter); break; }
398 else{ scores += letter; }
403 if (names.count(saveName) == 0) {
404 wroteSomething = true;
406 out << name << endl << scores;
415 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
416 outputNames.push_back(outputFileName); outputTypes["qfile"].push_back(outputFileName);
421 catch(exception& e) {
422 m->errorOut(e, "RemoveSeqsCommand", "readQual");
426 //**********************************************************************************************************************
427 int RemoveSeqsCommand::readList(){
429 string thisOutputDir = outputDir;
430 if (outputDir == "") { thisOutputDir += m->hasPath(listfile); }
431 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(listfile)) + "pick" + m->getExtension(listfile);
434 m->openOutputFile(outputFileName, out);
437 m->openInputFile(listfile, in);
439 bool wroteSomething = false;
442 //read in list vector
445 //make a new list vector
447 newList.setLabel(list.getLabel());
450 for (int i = 0; i < list.getNumBins(); i++) {
451 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
453 //parse out names that are in accnos file
454 string binnames = list.get(i);
456 string newNames = "";
457 while (binnames.find_first_of(',') != -1) {
458 string name = binnames.substr(0,binnames.find_first_of(','));
459 binnames = binnames.substr(binnames.find_first_of(',')+1, binnames.length());
461 //if that name is in the .accnos file, add it
462 if (names.count(name) == 0) { newNames += name + ","; }
466 if (names.count(binnames) == 0) { newNames += binnames + ","; }
468 //if there are names in this bin add to new list
469 if (newNames != "") {
470 newNames = newNames.substr(0, newNames.length()-1); //rip off extra comma
471 newList.push_back(newNames);
475 //print new listvector
476 if (newList.getNumBins() != 0) {
477 wroteSomething = true;
486 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
487 outputTypes["list"].push_back(outputFileName); outputNames.push_back(outputFileName);
492 catch(exception& e) {
493 m->errorOut(e, "RemoveSeqsCommand", "readList");
497 //**********************************************************************************************************************
498 int RemoveSeqsCommand::readName(){
500 string thisOutputDir = outputDir;
501 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
502 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pick" + m->getExtension(namefile);
505 m->openOutputFile(outputFileName, out);
508 m->openInputFile(namefile, in);
509 string name, firstCol, secondCol;
511 bool wroteSomething = false;
515 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
517 in >> firstCol; m->gobble(in);
520 vector<string> parsedNames;
521 m->splitAtComma(secondCol, parsedNames);
523 vector<string> validSecond; validSecond.clear();
524 for (int i = 0; i < parsedNames.size(); i++) {
525 if (names.count(parsedNames[i]) == 0) {
526 validSecond.push_back(parsedNames[i]);
530 if ((dups) && (validSecond.size() != parsedNames.size())) { //if dups is true and we want to get rid of anyone, get rid of everyone
531 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
533 //if the name in the first column is in the set then print it and any other names in second column also in set
534 if (names.count(firstCol) == 0) {
536 wroteSomething = true;
538 out << firstCol << '\t';
540 //you know you have at least one valid second since first column is valid
541 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
542 out << validSecond[validSecond.size()-1] << endl;
544 //make first name in set you come to first column and then add the remaining names to second column
547 //you want part of this row
548 if (validSecond.size() != 0) {
550 wroteSomething = true;
552 out << validSecond[0] << '\t';
554 //you know you have at least one valid second since first column is valid
555 for (int i = 0; i < validSecond.size()-1; i++) { out << validSecond[i] << ','; }
556 out << validSecond[validSecond.size()-1] << endl;
565 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
566 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
570 catch(exception& e) {
571 m->errorOut(e, "RemoveSeqsCommand", "readName");
576 //**********************************************************************************************************************
577 int RemoveSeqsCommand::readGroup(){
579 string thisOutputDir = outputDir;
580 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
581 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pick" + m->getExtension(groupfile);
584 m->openOutputFile(outputFileName, out);
587 m->openInputFile(groupfile, in);
590 bool wroteSomething = false;
593 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
595 in >> name; //read from first column
596 in >> group; //read from second column
598 //if this name is in the accnos file
599 if (names.count(name) == 0) {
600 wroteSomething = true;
601 out << name << '\t' << group << endl;
609 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
610 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
614 catch(exception& e) {
615 m->errorOut(e, "RemoveSeqsCommand", "readGroup");
619 //**********************************************************************************************************************
620 int RemoveSeqsCommand::readTax(){
622 string thisOutputDir = outputDir;
623 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
624 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pick" + m->getExtension(taxfile);
626 m->openOutputFile(outputFileName, out);
629 m->openInputFile(taxfile, in);
632 bool wroteSomething = false;
635 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
637 in >> name; //read from first column
638 in >> tax; //read from second column
640 //if this name is in the accnos file
641 if (names.count(name) == 0) {
642 wroteSomething = true;
643 out << name << '\t' << tax << endl;
651 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
652 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
656 catch(exception& e) {
657 m->errorOut(e, "RemoveSeqsCommand", "readTax");
661 //**********************************************************************************************************************
662 //alignreport file has a column header line then all other lines contain 16 columns. we just want the first column since that contains the name
663 int RemoveSeqsCommand::readAlign(){
665 string thisOutputDir = outputDir;
666 if (outputDir == "") { thisOutputDir += m->hasPath(alignfile); }
667 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(alignfile)) + "pick.align.report";
670 m->openOutputFile(outputFileName, out);
673 m->openInputFile(alignfile, in);
676 bool wroteSomething = false;
678 //read column headers
679 for (int i = 0; i < 16; i++) {
680 if (!in.eof()) { in >> junk; out << junk << '\t'; }
686 if (m->control_pressed) { in.close(); out.close(); remove(outputFileName.c_str()); return 0; }
688 in >> name; //read from first column
690 //if this name is in the accnos file
691 if (names.count(name) == 0) {
692 wroteSomething = true;
697 for (int i = 0; i < 15; i++) {
698 if (!in.eof()) { in >> junk; out << junk << '\t'; }
703 }else {//still read just don't do anything with it
706 for (int i = 0; i < 15; i++) {
707 if (!in.eof()) { in >> junk; }
717 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
718 outputTypes["alignreport"].push_back(outputFileName); outputNames.push_back(outputFileName);
723 catch(exception& e) {
724 m->errorOut(e, "RemoveSeqsCommand", "readAlign");
728 //**********************************************************************************************************************
729 void RemoveSeqsCommand::readAccnos(){
733 m->openInputFile(accnosfile, in);
746 catch(exception& e) {
747 m->errorOut(e, "RemoveSeqsCommand", "readAccnos");
752 //**********************************************************************************************************************