2 * screenseqscommand.cpp
5 * Created by Pat Schloss on 6/3/09.
6 * Copyright 2009 Patrick D. Schloss. All rights reserved.
10 #include "screenseqscommand.h"
11 #include "sequence.hpp"
13 //***************************************************************************************************************
15 ScreenSeqsCommand::ScreenSeqsCommand(string option){
17 globaldata = GlobalData::getInstance();
20 //allow user to run help
21 if(option == "help") { help(); abort = true; }
24 //valid paramters for this command
25 string AlignArray[] = {"fasta", "start", "end", "maxambig", "maxhomop", "minlength", "maxlength", "name", "group"};
26 vector<string> myArray (AlignArray, AlignArray+(sizeof(AlignArray)/sizeof(string)));
28 parser = new OptionParser();
29 parser->parse(option, parameters); delete parser;
31 ValidParameters* validParameter = new ValidParameters();
33 //check to make sure all parameters are valid for command
34 for (it = parameters.begin(); it != parameters.end(); it++) {
35 if (validParameter->isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
38 //check for required parameters
39 fastafile = validParameter->validFile(parameters, "fasta", true);
40 if (fastafile == "not found") { cout << "fasta is a required parameter for the screen.seqs command." << endl; abort = true; }
41 else if (fastafile == "not open") { abort = true; }
42 else { globaldata->setFastaFile(fastafile); }
44 groupfile = validParameter->validFile(parameters, "group", true);
45 if (groupfile == "not open") { abort = true; }
46 else if (groupfile == "not found") { groupfile = ""; }
48 globaldata->setGroupFile(groupfile);
51 namefile = validParameter->validFile(parameters, "name", true);
52 if (namefile == "not open") { abort = true; }
53 else if (namefile == "not found") { namefile = ""; }
55 globaldata->setNameFile(namefile);
59 //check for optional parameter and set defaults
60 // ...at some point should added some additional type checking...
62 temp = validParameter->validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
63 convert(temp, startPos);
65 temp = validParameter->validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
66 convert(temp, endPos);
68 temp = validParameter->validFile(parameters, "maxambig", false); if (temp == "not found") { temp = "-1"; }
69 convert(temp, maxAmbig);
71 temp = validParameter->validFile(parameters, "maxhomop", false); if (temp == "not found") { temp = "-1"; }
72 convert(temp, maxHomoP);
74 temp = validParameter->validFile(parameters, "minlength", false); if (temp == "not found") { temp = "-1"; }
75 convert(temp, minLength);
77 temp = validParameter->validFile(parameters, "maxlength", false); if (temp == "not found") { temp = "-1"; }
78 convert(temp, maxLength);
80 delete validParameter;
85 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
89 cout << "An unknown error has occurred in the ScreenSeqsCommand class function ScreenSeqsCommand. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
93 //**********************************************************************************************************************
95 void ScreenSeqsCommand::help(){
97 cout << "The screen.seqs command reads a fastafile and creates ....." << "\n";
98 cout << "The screen.seqs command parameters are fasta, start, end, maxambig, maxhomop, minlength, maxlength, name, and group." << "\n";
99 cout << "The fasta parameter is required." << "\n";
100 cout << "The start parameter .... The default is -1." << "\n";
101 cout << "The end parameter .... The default is -1." << "\n";
102 cout << "The maxambig parameter .... The default is -1." << "\n";
103 cout << "The maxhomop parameter .... The default is -1." << "\n";
104 cout << "The minlength parameter .... The default is -1." << "\n";
105 cout << "The maxlength parameter .... The default is -1." << "\n";
106 cout << "The name parameter allows you to provide a namesfile, and the group parameter allows you to provide a groupfile." << "\n";
107 cout << "The screen.seqs command should be in the following format: " << "\n";
108 cout << "screen.seqs(fasta=yourFastaFile, name=youNameFile, group=yourGroupFIle, start=yourStart, end=yourEnd, maxambig=yourMaxambig, " << "\n";
109 cout << "maxhomop=yourMaxhomop, minlength=youMinlength, maxlength=yourMaxlength) " << "\n";
110 cout << "Example screen.seqs(fasta=abrecovery.fasta, name=abrecovery.names, group=abrecovery.groups, start=..., end=..., maxambig=..., maxhomop=..., minlength=..., maxlength=...)." << "\n";
111 cout << "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta)." << "\n" << "\n";
114 catch(exception& e) {
115 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
119 cout << "An unknown error has occurred in the ScreenSeqsCommand class function help. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
124 //***************************************************************************************************************
126 ScreenSeqsCommand::~ScreenSeqsCommand(){ /* do nothing */ }
128 //***************************************************************************************************************
130 int ScreenSeqsCommand::execute(){
133 if (abort == true) { return 0; }
136 openInputFile(fastafile, inFASTA);
138 set<string> badSeqNames;
140 string goodSeqFile = getRootName(fastafile) + "good" + getExtension(fastafile);
141 string badSeqFile = getRootName(fastafile) + "bad" + getExtension(fastafile);
143 ofstream goodSeqOut; openOutputFile(goodSeqFile, goodSeqOut);
144 ofstream badSeqOut; openOutputFile(badSeqFile, badSeqOut);
146 while(!inFASTA.eof()){
147 Sequence currSeq(inFASTA);
148 bool goodSeq = 1; // innocent until proven guilty
149 if(goodSeq == 1 && startPos != -1 && startPos < currSeq.getStartPos()) { goodSeq = 0; }
150 if(goodSeq == 1 && endPos != -1 && endPos > currSeq.getEndPos()) { goodSeq = 0; }
151 if(goodSeq == 1 && maxAmbig != -1 && maxAmbig < currSeq.getAmbigBases()) { goodSeq = 0; }
152 if(goodSeq == 1 && maxHomoP != -1 && maxHomoP < currSeq.getLongHomoPolymer()) { goodSeq = 0; }
153 if(goodSeq == 1 && minLength != -1 && minLength > currSeq.getNumBases()) { goodSeq = 0; }
154 if(goodSeq == 1 && maxLength != -1 && maxLength < currSeq.getNumBases()) { goodSeq = 0; }
157 currSeq.printSequence(goodSeqOut);
160 currSeq.printSequence(badSeqOut);
161 badSeqNames.insert(currSeq.getName());
166 screenNameGroupFile(badSeqNames);
168 else if(groupfile != ""){
169 screenGroupFile(badSeqNames);
174 catch(exception& e) {
175 cout << "Standard Error: " << e.what() << " has occurred in the ScreenSeqsCommand class Function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
179 cout << "An unknown error has occurred in the ScreenSeqsCommand class function execute. Please contact Pat Schloss at pschloss@microbio.umass.edu." << "\n";
185 //***************************************************************************************************************
187 void ScreenSeqsCommand::screenNameGroupFile(set<string> badSeqNames){
190 openInputFile(globaldata->getNameFile(), inputNames);
191 set<string> badSeqGroups;
192 string seqName, seqList, group;
193 set<string>::iterator it;
195 string goodNameFile = getRootName(globaldata->getNameFile()) + "good" + getExtension(globaldata->getNameFile());
196 string badNameFile = getRootName(globaldata->getNameFile()) + "bad" + getExtension(globaldata->getNameFile());
198 ofstream goodNameOut; openOutputFile(goodNameFile, goodNameOut);
199 ofstream badNameOut; openOutputFile(badNameFile, badNameOut);
201 while(!inputNames.eof()){
202 inputNames >> seqName >> seqList;
203 it = badSeqNames.find(seqName);
205 if(it != badSeqNames.end()){
206 badSeqNames.erase(it);
207 badNameOut << seqName << '\t' << seqList << endl;
208 if(globaldata->getNameFile() != ""){
210 for(int i=0;i<seqList.length();i++){
211 if(seqList[i] == ','){
212 badSeqGroups.insert(seqList.substr(start,i-start));
216 badSeqGroups.insert(seqList.substr(start,seqList.length()-start));
220 goodNameOut << seqName << '\t' << seqList << endl;
228 if(globaldata->getGroupFile() != ""){
230 ifstream inputGroups;
231 openInputFile(globaldata->getGroupFile(), inputGroups);
233 string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
234 string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
236 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
237 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
239 while(!inputGroups.eof()){
240 inputGroups >> seqName >> group;
242 it = badSeqGroups.find(seqName);
244 if(it != badSeqGroups.end()){
245 badSeqGroups.erase(it);
246 badGroupOut << seqName << '\t' << group << endl;
249 goodGroupOut << seqName << '\t' << group << endl;
254 goodGroupOut.close();
259 //***************************************************************************************************************
261 void ScreenSeqsCommand::screenGroupFile(set<string> badSeqNames){
263 ifstream inputGroups;
264 openInputFile(globaldata->getGroupFile(), inputGroups);
265 string seqName, group;
266 set<string>::iterator it;
268 string goodGroupFile = getRootName(globaldata->getGroupFile()) + "good" + getExtension(globaldata->getGroupFile());
269 string badGroupFile = getRootName(globaldata->getGroupFile()) + "bad" + getExtension(globaldata->getGroupFile());
271 ofstream goodGroupOut; openOutputFile(goodGroupFile, goodGroupOut);
272 ofstream badGroupOut; openOutputFile(badGroupFile, badGroupOut);
274 while(!inputGroups.eof()){
275 inputGroups >> seqName >> group;
276 it = badSeqNames.find(seqName);
278 if(it != badSeqNames.end()){
279 badSeqNames.erase(it);
280 badGroupOut << seqName << '\t' << group << endl;
283 goodGroupOut << seqName << '\t' << group << endl;
288 goodGroupOut.close();
293 //***************************************************************************************************************