2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //***************************************************************************************************************
15 ScreenSeqsCommand::ScreenSeqsCommand(string option) {
19 //allow user to run help
20 if(option == "help") { help(); abort = true; }
23 //valid paramters for this command
24 string AlignArray[] = {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength",
25 "name", "group", "alignreport","outputdir","inputdir"};
26 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
28 OptionParser parser(option);
29 map<string,string> parameters = parser.getParameters();
31 ValidParameters validParameter;
32 map<string,string>::iterator it;
34 //check to make sure all parameters are valid for command
35 for (it = parameters.begin(); it != parameters.end(); it++) {
36 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
39 //if the user changes the input directory command factory will send this info to us in the output parameter
40 string inputDir = validParameter.validFile(parameters, "inputdir", false);
41 if (inputDir == "not found"){ inputDir = ""; }
44 it = parameters.find("fasta");
45 //user has given a template file
46 if(it != parameters.end()){
47 path = hasPath(it->second);
48 //if the user has not given a path then, add inputdir. else leave path alone.
49 if (path == "") { parameters["fasta"] = inputDir + it->second; }
52 it = parameters.find("group");
53 //user has given a template file
54 if(it != parameters.end()){
55 path = hasPath(it->second);
56 //if the user has not given a path then, add inputdir. else leave path alone.
57 if (path == "") { parameters["group"] = inputDir + it->second; }
60 it = parameters.find("name");
61 //user has given a template file
62 if(it != parameters.end()){
63 path = hasPath(it->second);
64 //if the user has not given a path then, add inputdir. else leave path alone.
65 if (path == "") { parameters["name"] = inputDir + it->second; }
68 it = parameters.find("alignreport");
69 //user has given a template file
70 if(it != parameters.end()){
71 path = hasPath(it->second);
72 //if the user has not given a path then, add inputdir. else leave path alone.
73 if (path == "") { parameters["alignreport"] = inputDir + it->second; }
77 //check for required parameters
78 fastafile = validParameter.validFile(parameters, "fasta", true);
79 if (fastafile == "not found") { m->mothurOut("fasta is a required parameter for the screen.seqs command."); m->mothurOutEndLine(); abort = true; }
80 else if (fastafile == "not open") { abort = true; }
82 groupfile = validParameter.validFile(parameters, "group", true);
83 if (groupfile == "not open") { abort = true; }
84 else if (groupfile == "not found") { groupfile = ""; }
86 namefile = validParameter.validFile(parameters, "name", true);
87 if (namefile == "not open") { abort = true; }
88 else if (namefile == "not found") { namefile = ""; }
90 alignreport = validParameter.validFile(parameters, "alignreport", true);
91 if (alignreport == "not open") { abort = true; }
92 else if (alignreport == "not found") { alignreport = ""; }
94 //if the user changes the output directory command factory will send this info to us in the output parameter
95 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){
97 outputDir += hasPath(fastafile); //if user entered a file with a path then preserve it
100 //check for optional parameter and set defaults
101 // ...at some point should added some additional type checking...
103 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
104 convert(temp, startPos);
106 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
107 convert(temp, endPos);
109 temp = validParameter.validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
110 convert(temp, maxAmbig);
112 temp = validParameter.validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
113 convert(temp, maxHomoP);
115 temp = validParameter.validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
116 convert(temp, minLength);
118 temp = validParameter.validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
119 convert(temp, maxLength);
123 catch(exception& e) {
124 m->errorOut(e, "ScreenSeqsCommand", "ScreenSeqsCommand");
128 //**********************************************************************************************************************
130 void ScreenSeqsCommand::help(){
132 m->mothurOut("The screen.seqs command reads a fastafile and creates .....\n");
133 m->mothurOut("The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, and group.\n");
134 m->mothurOut("The fasta parameter is required.\n");
135 m->mothurOut("The start parameter .... The default is -1.\n");
136 m->mothurOut("The end parameter .... The default is -1.\n");
137 m->mothurOut("The maxambig parameter .... The default is -1.\n");
138 m->mothurOut("The maxhomop parameter .... The default is -1.\n");
139 m->mothurOut("The minlength parameter .... The default is -1.\n");
140 m->mothurOut("The maxlength parameter .... The default is -1.\n");
141 m->mothurOut("The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile.\n");
142 m->mothurOut("The screen.seqs command should be in the following format: \n");
143 m->mothurOut("screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, \n");
144 m->mothurOut("maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) \n");
145 m->mothurOut("Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...).\n");
146 m->mothurOut("Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n\n");
149 catch(exception& e) {
150 m->errorOut(e, "ScreenSeqsCommand", "help");
155 //***************************************************************************************************************
157 ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ }
159 //***************************************************************************************************************
161 int ScreenSeqsCommand::execute(){
164 if (abort == true) { return 0; }
167 openInputFile(fastafile, inFASTA);
169 set<string> badSeqNames;
171 string goodSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "good" + getExtension(fastafile);
172 string badSeqFile = outputDir + getRootName(getSimpleName(fastafile)) + "bad" + getExtension(fastafile);
174 ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut);
175 ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut);
177 while(!inFASTA.eof()){
178 Sequence currSeq(inFASTA);
179 if (currSeq.getName() != "") {
180 bool goodSeq = 1; // innocent until proven guilty
181 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
182 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
183 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
184 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
185 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
186 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
189 currSeq.printSequence(goodSeqOut);
192 currSeq.printSequence(badSeqOut);
193 badSeqNames.insert(currSeq.getName());
198 if(namefile != "" && groupfile != "") { screenNameGroupFile(badSeqNames); } // this screens both names and groups
199 else if(namefile != "") { screenNameGroupFile(badSeqNames); }
200 else if(groupfile != "") { screenGroupFile(badSeqNames); } // this screens just the groups
201 if(alignreport != "") { screenAlignReport(badSeqNames); }
207 m->mothurOutEndLine();
208 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
209 m->mothurOut(goodSeqFile); m->mothurOutEndLine();
210 m->mothurOut(badSeqFile); m->mothurOutEndLine();
211 m->mothurOutEndLine();
216 catch(exception& e) {
217 m->errorOut(e, "ScreenSeqsCommand", "execute");
222 //***************************************************************************************************************
224 void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
227 openInputFile(namefile, inputNames);
228 set<string> badSeqGroups;
229 string seqName, seqList, group;
230 set<string>::iterator it;
232 string goodNameFile = outputDir + getRootName(getSimpleName(namefile)) + "good" + getExtension(namefile);
233 string badNameFile = outputDir + getRootName(getSimpleName(namefile)) + "bad" + getExtension(namefile);
235 ofstream goodNameOut; openOutputFile(goodNameFile, goodNameOut);
236 ofstream badNameOut; openOutputFile(badNameFile, badNameOut);
238 while(!inputNames.eof()){
239 inputNames >> seqName >> seqList;
240 it = badSeqNames.find(seqName);
242 if(it != badSeqNames.end()){
243 badSeqNames.erase(it);
244 badNameOut << seqName << '\t' << seqList << endl;
247 for(int i=0;i<seqList.length();i++){
248 if(seqList[i] == ','){
249 badSeqGroups.insert(seqList.substr(start,i-start));
253 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
257 goodNameOut << seqName << '\t' << seqList << endl;
265 //we were unable to remove some of the bad sequences
266 if (badSeqNames.size() != 0) {
267 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
268 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
269 m->mothurOutEndLine();
275 ifstream inputGroups;
276 openInputFile(groupfile, inputGroups);
278 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
279 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
281 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
282 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
284 while(!inputGroups.eof()){
285 inputGroups >> seqName >> group;
287 it = badSeqGroups.find(seqName);
289 if(it != badSeqGroups.end()){
290 badSeqGroups.erase(it);
291 badGroupOut << seqName << '\t' << group << endl;
294 goodGroupOut << seqName << '\t' << group << endl;
299 goodGroupOut.close();
302 //we were unable to remove some of the bad sequences
303 if (badSeqGroups.size() != 0) {
304 for (it = badSeqGroups.begin(); it != badSeqGroups.end(); it++) {
305 m->mothurOut("Your namefile does not include the sequence " + *it + " please correct.");
306 m->mothurOutEndLine();
312 //***************************************************************************************************************
314 void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
316 ifstream inputGroups;
317 openInputFile(groupfile, inputGroups);
318 string seqName, group;
319 set<string>::iterator it;
321 string goodGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "good" + getExtension(groupfile);
322 string badGroupFile = outputDir + getRootName(getSimpleName(groupfile)) + "bad" + getExtension(groupfile);
324 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
325 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
327 while(!inputGroups.eof()){
328 inputGroups >> seqName >> group;
329 it = badSeqNames.find(seqName);
331 if(it != badSeqNames.end()){
332 badSeqNames.erase(it);
333 badGroupOut << seqName << '\t' << group << endl;
336 goodGroupOut << seqName << '\t' << group << endl;
341 //we were unable to remove some of the bad sequences
342 if (badSeqNames.size() != 0) {
343 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
344 m->mothurOut("Your groupfile does not include the sequence " + *it + " please correct.");
345 m->mothurOutEndLine();
350 goodGroupOut.close();
355 //***************************************************************************************************************
357 void ScreenSeqsCommand::screenAlignReport(set<string> badSeqNames){
359 ifstream inputAlignReport;
360 openInputFile(alignreport, inputAlignReport);
361 string seqName, group;
362 set<string>::iterator it;
364 string goodAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "good" + getExtension(alignreport);
365 string badAlignReportFile = outputDir + getRootName(getSimpleName(alignreport)) + "bad" + getExtension(alignreport);
367 ofstream goodAlignReportOut; openOutputFile(goodAlignReportFile, goodAlignReportOut);
368 ofstream badAlignReportOut; openOutputFile(badAlignReportFile, badAlignReportOut);
370 while (!inputAlignReport.eof()) { // need to copy header
371 char c = inputAlignReport.get();
372 goodAlignReportOut << c;
373 badAlignReportOut << c;
374 if (c == 10 || c == 13){ break; }
377 while(!inputAlignReport.eof()){
378 inputAlignReport >> seqName;
379 it = badSeqNames.find(seqName);
381 while (!inputAlignReport.eof()) { // need to copy header
382 char c = inputAlignReport.get();
384 if (c == 10 || c == 13){ break; }
387 if(it != badSeqNames.end()){
388 badSeqNames.erase(it);
389 badAlignReportOut << seqName << '\t' << line;;
392 goodAlignReportOut << seqName << '\t' << line;
394 gobble(inputAlignReport);
397 //we were unable to remove some of the bad sequences
398 if (badSeqNames.size() != 0) {
399 for (it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
400 m->mothurOut("Your file does not include the sequence " + *it + " please correct.");
401 m->mothurOutEndLine();
405 inputAlignReport.close();
406 goodAlignReportOut.close();
407 badAlignReportOut.close();
411 //***************************************************************************************************************