2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //***************************************************************************************************************
15 ScreenSeqsCommand::ScreenSeqsCommand(string option){
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string AlignArray[] = {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25 "name", "group", "alignreport","outputdir","inputdir"};
26 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
28 OptionParser parser(option);
29 map<string,string> parameters = parser.getParameters();
31 ValidParameters validParameter;
32 map<string,string>::iterator it;
34 //check to make sure all parameters are valid for command
35 for (it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //if the user changes the input directory command factory will send this info to us in the output parameter
40 string inputDir = validParameter.validFile(parameters, "inputdir", false);
41 if (inputDir == "not found"){ inputDir = ""; }
44 it = parameters.find("fasta");
45 //user has given a template file
46 if(it != parameters.end()){
47 path = hasPath(it->second);
48 //if the user has not given a path then, add inputdir. else leave path alone.
49 if (path == "") { parameters["fasta"] = inputDir + it->second; }
52 it = parameters.find("group");
53 //user has given a template file
54 if(it != parameters.end()){
55 path = hasPath(it->second);
56 //if the user has not given a path then, add inputdir. else leave path alone.
57 if (path == "") { parameters["group"] = inputDir + it->second; }
60 it = parameters.find("name");
61 //user has given a template file
62 if(it != parameters.end()){
63 path = hasPath(it->second);
64 //if the user has not given a path then, add inputdir. else leave path alone.
65 if (path == "") { parameters["name"] = inputDir + it->second; }
68 it = parameters.find("alignreport");
69 //user has given a template file
70 if(it != parameters.end()){
71 path = hasPath(it->second);
72 //if the user has not given a path then, add inputdir. else leave path alone.
73 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
77 //check for required parameters
78 fastafile = validParameter.validFile(parameters, "fasta", true);
79 if (fastafile == "not found") { mothurOut("fasta is a required parameter for the screen.seqs command."); mothurOutEndLine(); abort = true; }
80 else if (fastafile == "not open") { abort = true; }
82 groupfile = validParameter.validFile(parameters, "group", true);
83 if (groupfile == "not open") { abort = true; }
84 else if (groupfile == "not found") { groupfile = ""; }
86 namefile = validParameter.validFile(parameters, "name", true);
87 if (namefile == "not open") { abort = true; }
88 else if (namefile == "not found") { namefile = ""; }
90 alignreport = validParameter.validFile(parameters, "alignreport", true);
91 if (alignreport == "not open") { abort = true; }
92 else if (alignreport == "not found") { alignreport = ""; }
94 //if the user changes the output directory command factory will send this info to us in the output parameter
95 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
97 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
100 //check for optional parameter and set defaults
101 // ...at some point should added some additional type checking...
103 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
104 convert(temp, startPos);
106 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
107 convert(temp, endPos);
109 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
110 convert(temp, maxAmbig);
112 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
113 convert(temp, maxHomoP);
115 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
116 convert(temp, minLength);
118 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
119 convert(temp, maxLength);
123 catch(exception& e) {
124 errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
128 //**********************************************************************************************************************
130 void ScreenSeqsCommand::help(){
132 mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
133 mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, and group.\n");
134 mothurOut("The fasta parameter is required.\n");
135 mothurOut("The start parameter .... The default is -1.\n");
136 mothurOut("The end parameter .... The default is -1.\n");
137 mothurOut("The maxambig parameter .... The default is -1.\n");
138 mothurOut("The maxhomop parameter .... The default is -1.\n");
139 mothurOut("The minlength parameter .... The default is -1.\n");
140 mothurOut("The maxlength parameter .... The default is -1.\n");
141 mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
142 mothurOut("The screen.seqs command should be in the following format: \n");
143 mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n");
144 mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
145 mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
146 mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
149 catch(exception& e) {
150 errorOut(e, "ScreenSeqsCommand", "help");
155 //***************************************************************************************************************
157 ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ }
159 //***************************************************************************************************************
161 int ScreenSeqsCommand::execute(){
164 if (abort == true) { return 0; }
167 openInputFile(fastafile, inFASTA);
169 set<string> badSeqNames;
171 string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
172 string badSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
174 ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut);
175 ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut);
177 while(!inFASTA.eof()){
178 Sequence currSeq(inFASTA);
179 if (currSeq.getName() != "") {
180 bool goodSeq = 1; // innocent until proven guilty
181 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
182 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
183 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
184 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
185 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
186 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
189 currSeq.printSequence(goodSeqOut);
192 currSeq.printSequence(badSeqOut);
193 badSeqNames.insert(currSeq.getName());
198 if(namefile != "" && groupfile != "") { screenNameGroupFile(badSeqNames); } // this screens both names and groups
199 else if(namefile != "") { screenNameGroupFile(badSeqNames); }
200 else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the groups
201 if(alignreport != "") { screenAlignReport(badSeqNames); }
208 catch(exception& e) {
209 errorOut(e, "ScreenSeqsCommand", "execute");
214 //***************************************************************************************************************
216 void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
219 openInputFile(namefile, inputNames);
220 set<string> badSeqGroups;
221 string seqName, seqList, group;
222 set<string>::iterator it;
224 string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
225 string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
227 ofstream goodNameOut; openOutputFile(goodNameFile, goodNameOut);
228 ofstream badNameOut; openOutputFile(badNameFile, badNameOut);
230 while(!inputNames.eof()){
231 inputNames >> seqName >> seqList;
232 it = badSeqNames.find(seqName);
234 if(it != badSeqNames.end()){
235 badSeqNames.erase(it);
236 badNameOut << seqName << '\t' << seqList << endl;
239 for(int i=0;i<seqList.length();i++){
240 if(seqList[i] == ','){
241 badSeqGroups.insert(seqList.substr(start,i-start));
245 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
249 goodNameOut << seqName << '\t' << seqList << endl;
257 //we were unable to remove some of the bad sequences
258 if (badSeqNames.size() != 0) {
259 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
260 mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
267 ifstream inputGroups;
268 openInputFile(groupfile, inputGroups);
270 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
271 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
273 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
274 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
276 while(!inputGroups.eof()){
277 inputGroups >> seqName >> group;
279 it = badSeqGroups.find(seqName);
281 if(it != badSeqGroups.end()){
282 badSeqGroups.erase(it);
283 badGroupOut << seqName << '\t' << group << endl;
286 goodGroupOut << seqName << '\t' << group << endl;
291 goodGroupOut.close();
294 //we were unable to remove some of the bad sequences
295 if (badSeqGroups.size() != 0) {
296 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
297 mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
304 //***************************************************************************************************************
306 void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
308 ifstream inputGroups;
309 openInputFile(groupfile, inputGroups);
310 string seqName, group;
311 set<string>::iterator it;
313 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
314 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
316 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
317 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
319 while(!inputGroups.eof()){
320 inputGroups >> seqName >> group;
321 it = badSeqNames.find(seqName);
323 if(it != badSeqNames.end()){
324 badSeqNames.erase(it);
325 badGroupOut << seqName << '\t' << group << endl;
328 goodGroupOut << seqName << '\t' << group << endl;
333 //we were unable to remove some of the bad sequences
334 if (badSeqNames.size() != 0) {
335 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
336 mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
342 goodGroupOut.close();
347 //***************************************************************************************************************
349 void ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
351 ifstream inputAlignReport;
352 openInputFile(alignreport, inputAlignReport);
353 string seqName, group;
354 set<string>::iterator it;
356 string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
357 string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
359 ofstream goodAlignReportOut; openOutputFile(goodAlignReportFile, goodAlignReportOut);
360 ofstream badAlignReportOut; openOutputFile(badAlignReportFile, badAlignReportOut);
362 while (!inputAlignReport.eof()) { // need to copy header
363 char c = inputAlignReport.get();
364 goodAlignReportOut << c;
365 badAlignReportOut << c;
366 if (c == 10 || c == 13){ break; }
369 while(!inputAlignReport.eof()){
370 inputAlignReport >> seqName;
371 it = badSeqNames.find(seqName);
373 while (!inputAlignReport.eof()) { // need to copy header
374 char c = inputAlignReport.get();
376 if (c == 10 || c == 13){ break; }
379 if(it != badSeqNames.end()){
380 badSeqNames.erase(it);
381 badAlignReportOut << seqName << '\t' << line;;
384 goodAlignReportOut << seqName << '\t' << line;
386 gobble(inputAlignReport);
389 //we were unable to remove some of the bad sequences
390 if (badSeqNames.size() != 0) {
391 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
392 mothurOut("Your file does not include the sequence " + *it + " please correct.");
397 inputAlignReport.close();
398 goodAlignReportOut.close();
399 badAlignReportOut.close();
403 //***************************************************************************************************************