5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none",false,false); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none",false,false); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "PcrSeqsCommand", "setParameters");
39 //**********************************************************************************************************************
40 string PcrSeqsCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The pcr.seqs command reads a fasta file.\n";
44 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
45 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
46 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
47 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
48 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
49 helpString += "The processors parameter allows you to use multiple processors.\n";
50 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
51 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
53 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
57 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
62 //**********************************************************************************************************************
63 string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
65 string outputFileName = "";
66 map<string, vector<string> >::iterator it;
68 //is this a type this command creates
69 it = outputTypes.find(type);
70 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
72 if (type == "fasta") { outputFileName = "pcr.fasta"; }
73 else if (type == "taxonomy") { outputFileName = "pcr" + m->getExtension(inputName); }
74 else if (type == "group") { outputFileName = "pcr" + m->getExtension(inputName); }
75 else if (type == "name") { outputFileName = "pcr" + m->getExtension(inputName); }
76 else if (type == "count") { outputFileName = "pcr" + m->getExtension(inputName); }
77 else if (type == "accnos") { outputFileName = "bad.accnos"; }
78 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
80 return outputFileName;
83 m->errorOut(e, "PcrSeqsCommand", "getOutputFileNameTag");
87 //**********************************************************************************************************************
89 PcrSeqsCommand::PcrSeqsCommand(){
91 abort = true; calledHelp = true;
93 vector<string> tempOutNames;
94 outputTypes["fasta"] = tempOutNames;
95 outputTypes["taxonomy"] = tempOutNames;
96 outputTypes["group"] = tempOutNames;
97 outputTypes["name"] = tempOutNames;
98 outputTypes["count"] = tempOutNames;
99 outputTypes["accnos"] = tempOutNames;
101 catch(exception& e) {
102 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
106 //***************************************************************************************************************
108 PcrSeqsCommand::PcrSeqsCommand(string option) {
111 abort = false; calledHelp = false;
113 //allow user to run help
114 if(option == "help") { help(); abort = true; calledHelp = true; }
115 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
118 vector<string> myArray = setParameters();
120 OptionParser parser(option);
121 map<string,string> parameters = parser.getParameters();
123 ValidParameters validParameter;
124 map<string,string>::iterator it;
126 //check to make sure all parameters are valid for command
127 for (it = parameters.begin(); it != parameters.end(); it++) {
128 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
131 //initialize outputTypes
132 vector<string> tempOutNames;
133 outputTypes["fasta"] = tempOutNames;
134 outputTypes["taxonomy"] = tempOutNames;
135 outputTypes["group"] = tempOutNames;
136 outputTypes["name"] = tempOutNames;
137 outputTypes["accnos"] = tempOutNames;
138 outputTypes["count"] = tempOutNames;
140 //if the user changes the input directory command factory will send this info to us in the output parameter
141 string inputDir = validParameter.validFile(parameters, "inputdir", false);
142 if (inputDir == "not found"){ inputDir = ""; }
145 it = parameters.find("fasta");
146 //user has given a template file
147 if(it != parameters.end()){
148 path = m->hasPath(it->second);
149 //if the user has not given a path then, add inputdir. else leave path alone.
150 if (path == "") { parameters["fasta"] = inputDir + it->second; }
153 it = parameters.find("oligos");
154 //user has given a template file
155 if(it != parameters.end()){
156 path = m->hasPath(it->second);
157 //if the user has not given a path then, add inputdir. else leave path alone.
158 if (path == "") { parameters["oligos"] = inputDir + it->second; }
161 it = parameters.find("ecoli");
162 //user has given a template file
163 if(it != parameters.end()){
164 path = m->hasPath(it->second);
165 //if the user has not given a path then, add inputdir. else leave path alone.
166 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
169 it = parameters.find("taxonomy");
170 //user has given a template file
171 if(it != parameters.end()){
172 path = m->hasPath(it->second);
173 //if the user has not given a path then, add inputdir. else leave path alone.
174 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
177 it = parameters.find("name");
178 //user has given a template file
179 if(it != parameters.end()){
180 path = m->hasPath(it->second);
181 //if the user has not given a path then, add inputdir. else leave path alone.
182 if (path == "") { parameters["name"] = inputDir + it->second; }
185 it = parameters.find("group");
186 //user has given a template file
187 if(it != parameters.end()){
188 path = m->hasPath(it->second);
189 //if the user has not given a path then, add inputdir. else leave path alone.
190 if (path == "") { parameters["group"] = inputDir + it->second; }
193 it = parameters.find("count");
194 //user has given a template file
195 if(it != parameters.end()){
196 path = m->hasPath(it->second);
197 //if the user has not given a path then, add inputdir. else leave path alone.
198 if (path == "") { parameters["count"] = inputDir + it->second; }
204 //check for required parameters
205 fastafile = validParameter.validFile(parameters, "fasta", true);
206 if (fastafile == "not found") {
207 fastafile = m->getFastaFile();
208 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
209 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
210 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
211 else { m->setFastaFile(fastafile); }
213 //if the user changes the output directory command factory will send this info to us in the output parameter
214 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
216 //check for optional parameter and set defaults
217 // ...at some point should added some additional type checking...
219 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
220 keepprimer = m->isTrue(temp);
222 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
223 keepdots = m->isTrue(temp);
225 temp = validParameter.validFile(parameters, "oligos", true);
226 if (temp == "not found"){ oligosfile = ""; }
227 else if(temp == "not open"){ oligosfile = ""; abort = true; }
228 else { oligosfile = temp; m->setOligosFile(oligosfile); }
230 ecolifile = validParameter.validFile(parameters, "ecoli", true);
231 if (ecolifile == "not found"){ ecolifile = ""; }
232 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
234 namefile = validParameter.validFile(parameters, "name", true);
235 if (namefile == "not found"){ namefile = ""; }
236 else if(namefile == "not open"){ namefile = ""; abort = true; }
237 else { m->setNameFile(namefile); }
239 groupfile = validParameter.validFile(parameters, "group", true);
240 if (groupfile == "not found"){ groupfile = ""; }
241 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
242 else { m->setGroupFile(groupfile); }
244 countfile = validParameter.validFile(parameters, "count", true);
245 if (countfile == "not open") { countfile = ""; abort = true; }
246 else if (countfile == "not found") { countfile = ""; }
247 else { m->setCountTableFile(countfile); }
249 if ((namefile != "") && (countfile != "")) {
250 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
253 if ((groupfile != "") && (countfile != "")) {
254 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
257 taxfile = validParameter.validFile(parameters, "taxonomy", true);
258 if (taxfile == "not found"){ taxfile = ""; }
259 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
260 else { m->setTaxonomyFile(taxfile); }
262 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
263 m->mothurConvert(temp, start);
265 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
266 m->mothurConvert(temp, end);
268 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
269 m->setProcessors(temp);
270 m->mothurConvert(temp, processors);
272 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
274 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
277 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
278 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
281 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
283 if ((ecolifile != "") && (start != -1) && (end != -1)) {
284 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
288 if ((oligosfile != "") && (ecolifile != "")) {
289 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
292 //check to make sure you didn't forget the name file by mistake
293 if (countfile == "") {
294 if (namefile == "") {
295 vector<string> files; files.push_back(fastafile);
296 parser.getNameFile(files);
302 catch(exception& e) {
303 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
307 //***************************************************************************************************************
309 int PcrSeqsCommand::execute(){
312 if (abort == true) { if (calledHelp) { return 0; } return 2; }
314 int start = time(NULL);
316 string thisOutputDir = outputDir;
317 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
318 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta");
319 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
321 string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "scrap." + getOutputFileNameTag("fasta");
325 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
326 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
328 vector<unsigned long long> positions;
329 int numFastaSeqs = 0;
330 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
331 positions = m->divideFile(fastafile, processors);
332 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
334 if (processors == 1) {
335 lines.push_back(linePair(0, 1000));
337 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
338 if (positions.size() < processors) { processors = positions.size(); }
340 //figure out how many sequences you have to process
341 int numSeqsPerProcessor = numFastaSeqs / processors;
342 for (int i = 0; i < processors; i++) {
343 int startIndex = i * numSeqsPerProcessor;
344 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
345 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
349 if (m->control_pressed) { return 0; }
351 set<string> badNames;
352 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
353 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
355 if (m->control_pressed) { return 0; }
357 //don't write or keep if blank
358 if (badNames.size() != 0) { writeAccnos(badNames); }
359 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
360 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
362 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
363 if (namefile != "") { readName(badNames); }
364 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
365 if (groupfile != "") { readGroup(badNames); }
366 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
367 if (taxfile != "") { readTax(badNames); }
368 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
369 if (countfile != "") { readCount(badNames); }
370 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 m->mothurOutEndLine();
373 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
374 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
375 m->mothurOutEndLine();
376 m->mothurOutEndLine();
378 //set fasta file as new current fastafile
380 itTypes = outputTypes.find("fasta");
381 if (itTypes != outputTypes.end()) {
382 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
385 itTypes = outputTypes.find("name");
386 if (itTypes != outputTypes.end()) {
387 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
390 itTypes = outputTypes.find("group");
391 if (itTypes != outputTypes.end()) {
392 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
395 itTypes = outputTypes.find("accnos");
396 if (itTypes != outputTypes.end()) {
397 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
400 itTypes = outputTypes.find("taxonomy");
401 if (itTypes != outputTypes.end()) {
402 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
405 itTypes = outputTypes.find("count");
406 if (itTypes != outputTypes.end()) {
407 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
410 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
411 m->mothurOutEndLine();
417 catch(exception& e) {
418 m->errorOut(e, "PcrSeqsCommand", "execute");
422 /**************************************************************************************************/
423 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
426 vector<int> processIDS;
430 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
432 //loop through and create all the processes you want
433 while (process != processors) {
437 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
440 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
442 //pass numSeqs to parent
444 string tempFile = filename + toString(getpid()) + ".num.temp";
445 m->openOutputFile(tempFile, out);
446 out << num << '\t' << badSeqNames.size() << endl;
447 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
448 out << (*it) << endl;
454 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
455 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
460 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
462 //force parent to wait until all the processes are done
463 for (int i=0;i<processIDS.size();i++) {
464 int temp = processIDS[i];
468 for (int i = 0; i < processIDS.size(); i++) {
470 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
471 m->openInputFile(tempFile, in);
472 int numBadNames = 0; string name = "";
473 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
474 for (int j = 0; j < numBadNames; j++) {
475 in >> name; m->gobble(in);
476 badSeqNames.insert(name);
478 in.close(); m->mothurRemove(tempFile);
480 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
481 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
483 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
484 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
488 //////////////////////////////////////////////////////////////////////////////////////////////////////
489 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
490 //Above fork() will clone, so memory is separate, but that's not the case with windows,
491 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
492 //////////////////////////////////////////////////////////////////////////////////////////////////////
494 vector<pcrData*> pDataArray;
495 DWORD dwThreadIdArray[processors-1];
496 HANDLE hThreadArray[processors-1];
498 //Create processor worker threads.
499 for( int i=0; i<processors-1; i++ ){
501 string extension = "";
502 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
504 // Allocate memory for thread data.
505 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
506 pDataArray.push_back(tempPcr);
508 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
509 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
513 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
514 processIDS.push_back(processors-1);
516 //Wait until all threads have terminated.
517 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
519 //Close all thread handles and free memory allocations.
520 for(int i=0; i < pDataArray.size(); i++){
521 num += pDataArray[i]->count;
522 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
523 CloseHandle(hThreadArray[i]);
524 delete pDataArray[i];
527 for (int i = 0; i < processIDS.size(); i++) {
528 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
529 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
531 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
532 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
540 catch(exception& e) {
541 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
546 //**********************************************************************************************************************
547 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
550 m->openOutputFile(goodFasta, goodFile);
553 m->openOutputFile(badFasta, badFile);
556 m->openInputFile(filename, inFASTA);
558 inFASTA.seekg(filePos.start);
566 if (m->control_pressed) { break; }
568 Sequence currSeq(inFASTA); m->gobble(inFASTA);
570 string trashCode = "";
571 if (currSeq.getName() != "") {
574 if (oligosfile != "") {
575 map<int, int> mapAligned;
576 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
579 if (primers.size() != 0) {
580 int primerStart = 0; int primerEnd = 0;
581 bool good = findForward(currSeq, primerStart, primerEnd);
583 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
588 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
589 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
592 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
593 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
596 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
597 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
602 //process reverse primers
603 if (revPrimer.size() != 0) {
604 int primerStart = 0; int primerEnd = 0;
605 bool good = findReverse(currSeq, primerStart, primerEnd);
606 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
611 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
612 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
615 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
616 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
620 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
621 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
625 }else if (ecolifile != "") {
626 //make sure the seqs are aligned
627 lengths.insert(currSeq.getAligned().length());
628 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
629 else if (currSeq.getAligned().length() != length) {
630 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
633 currSeq.filterToPos(start);
634 currSeq.filterFromPos(end);
636 string seqString = currSeq.getAligned().substr(0, end);
637 seqString = seqString.substr(start);
638 currSeq.setAligned(seqString);
641 }else{ //using start and end to trim
642 //make sure the seqs are aligned
643 lengths.insert(currSeq.getAligned().length());
644 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
647 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
649 if (keepdots) { currSeq.filterFromPos(end); }
651 string seqString = currSeq.getAligned().substr(0, end);
652 currSeq.setAligned(seqString);
657 if (keepdots) { currSeq.filterToPos(start); }
659 string seqString = currSeq.getAligned().substr(start);
660 currSeq.setAligned(seqString);
666 //trimming removed all bases
667 if (currSeq.getUnaligned() == "") { goodSeq = false; }
669 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
671 badSeqNames.insert(currSeq.getName());
672 currSeq.setName(currSeq.getName() + '|' + trashCode);
673 currSeq.printSequence(badFile);
678 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
679 unsigned long long pos = inFASTA.tellg();
680 if ((pos == -1) || (pos >= filePos.end)) { break; }
682 if (inFASTA.eof()) { break; }
686 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
689 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
697 catch(exception& e) {
698 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
702 //********************************************************************/
703 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
706 string rawSequence = seq.getUnaligned();
708 for(int j=0;j<primers.size();j++){
709 string oligo = primers[j];
711 if(rawSequence.length() < oligo.length()) { break; }
714 int olength = oligo.length();
715 for (int j = 0; j < rawSequence.length()-olength; j++){
716 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
717 string rawChunk = rawSequence.substr(j, olength);
718 if(compareDNASeq(oligo, rawChunk)) {
720 primerEnd = primerStart + olength;
727 primerStart = 0; primerEnd = 0;
731 catch(exception& e) {
732 m->errorOut(e, "TrimOligos", "stripForward");
736 //******************************************************************/
737 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
740 string rawSequence = seq.getUnaligned();
742 for(int i=0;i<revPrimer.size();i++){
743 string oligo = revPrimer[i];
744 if(rawSequence.length() < oligo.length()) { break; }
747 int olength = oligo.length();
748 for (int j = rawSequence.length()-olength; j >= 0; j--){
749 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
750 string rawChunk = rawSequence.substr(j, olength);
752 if(compareDNASeq(oligo, rawChunk)) {
754 primerEnd = primerStart + olength;
761 primerStart = 0; primerEnd = 0;
764 catch(exception& e) {
765 m->errorOut(e, "PcrSeqsCommand", "findReverse");
769 //********************************************************************/
770 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
772 bool isAligned = false;
775 for (int i = 0; i < seq.length(); i++) {
776 if (!isalpha(seq[i])) { isAligned = true; }
777 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
778 } //ie. the 3rd base may be at spot 10 in the alignment
779 //later when we trim we want to trim from spot 10.
782 catch(exception& e) {
783 m->errorOut(e, "PcrSeqsCommand", "isAligned");
787 //********************************************************************/
788 string PcrSeqsCommand::reverseOligo(string oligo){
792 for(int i=oligo.length()-1;i>=0;i--){
794 if(oligo[i] == 'A') { reverse += 'T'; }
795 else if(oligo[i] == 'T'){ reverse += 'A'; }
796 else if(oligo[i] == 'U'){ reverse += 'A'; }
798 else if(oligo[i] == 'G'){ reverse += 'C'; }
799 else if(oligo[i] == 'C'){ reverse += 'G'; }
801 else if(oligo[i] == 'R'){ reverse += 'Y'; }
802 else if(oligo[i] == 'Y'){ reverse += 'R'; }
804 else if(oligo[i] == 'M'){ reverse += 'K'; }
805 else if(oligo[i] == 'K'){ reverse += 'M'; }
807 else if(oligo[i] == 'W'){ reverse += 'W'; }
808 else if(oligo[i] == 'S'){ reverse += 'S'; }
810 else if(oligo[i] == 'B'){ reverse += 'V'; }
811 else if(oligo[i] == 'V'){ reverse += 'B'; }
813 else if(oligo[i] == 'D'){ reverse += 'H'; }
814 else if(oligo[i] == 'H'){ reverse += 'D'; }
816 else { reverse += 'N'; }
822 catch(exception& e) {
823 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
828 //***************************************************************************************************************
829 bool PcrSeqsCommand::readOligos(){
832 m->openInputFile(oligosfile, inOligos);
834 string type, oligo, group;
836 while(!inOligos.eof()){
840 if(type[0] == '#'){ //ignore
841 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
845 //make type case insensitive
846 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
850 for(int i=0;i<oligo.length();i++){
851 oligo[i] = toupper(oligo[i]);
852 if(oligo[i] == 'U') { oligo[i] = 'T'; }
855 if(type == "FORWARD"){
856 // get rest of line in case there is a primer name
857 while (!inOligos.eof()) {
858 char c = inOligos.get();
859 if (c == 10 || c == 13){ break; }
860 else if (c == 32 || c == 9){;} //space or tab
862 primers.push_back(oligo);
863 }else if(type == "REVERSE"){
864 string oligoRC = reverseOligo(oligo);
865 revPrimer.push_back(oligoRC);
866 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
867 }else if(type == "BARCODE"){
869 }else if((type == "LINKER")||(type == "SPACER")) {;}
870 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
876 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
877 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
878 m->control_pressed = true;
884 }catch(exception& e) {
885 m->errorOut(e, "PcrSeqsCommand", "readOligos");
889 //***************************************************************************************************************
890 bool PcrSeqsCommand::readEcoli(){
893 m->openInputFile(ecolifile, in);
898 length = ecoli.getAligned().length();
899 start = ecoli.getStartPos();
900 end = ecoli.getEndPos();
901 }else { in.close(); m->control_pressed = true; return false; }
906 catch(exception& e) {
907 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
912 //***************************************************************************************************************
913 int PcrSeqsCommand::writeAccnos(set<string> badNames){
915 string thisOutputDir = outputDir;
916 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
917 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
918 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
921 m->openOutputFile(outputFileName, out);
923 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
924 if (m->control_pressed) { break; }
925 out << (*it) << endl;
931 catch(exception& e) {
932 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
937 //******************************************************************/
938 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
941 int length = oligo.length();
943 for(int i=0;i<length;i++){
945 if(oligo[i] != seq[i]){
946 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
947 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
948 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
949 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
950 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
951 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
952 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
953 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
954 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
955 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
956 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
957 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
959 if(success == 0) { break; }
968 catch(exception& e) {
969 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
974 //***************************************************************************************************************
975 int PcrSeqsCommand::readName(set<string>& names){
977 string thisOutputDir = outputDir;
978 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
979 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
982 m->openOutputFile(outputFileName, out);
985 m->openInputFile(namefile, in);
986 string name, firstCol, secondCol;
988 bool wroteSomething = false;
989 int removedCount = 0;
992 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
994 in >> firstCol; m->gobble(in);
997 string savedSecond = secondCol;
998 vector<string> parsedNames;
999 m->splitAtComma(secondCol, parsedNames);
1001 vector<string> validSecond; validSecond.clear();
1002 for (int i = 0; i < parsedNames.size(); i++) {
1003 if (names.count(parsedNames[i]) == 0) {
1004 validSecond.push_back(parsedNames[i]);
1008 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1009 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1010 removedCount += parsedNames.size();
1012 out << firstCol << '\t' << savedSecond << endl;
1013 wroteSomething = true;
1020 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1021 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1023 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1027 catch(exception& e) {
1028 m->errorOut(e, "PcrSeqsCommand", "readName");
1032 //**********************************************************************************************************************
1033 int PcrSeqsCommand::readGroup(set<string> names){
1035 string thisOutputDir = outputDir;
1036 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1037 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
1040 m->openOutputFile(outputFileName, out);
1043 m->openInputFile(groupfile, in);
1046 bool wroteSomething = false;
1047 int removedCount = 0;
1050 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1052 in >> name; //read from first column
1053 in >> group; //read from second column
1055 //if this name is in the accnos file
1056 if (names.count(name) == 0) {
1057 wroteSomething = true;
1058 out << name << '\t' << group << endl;
1059 }else { removedCount++; }
1066 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1067 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1069 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1074 catch(exception& e) {
1075 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1079 //**********************************************************************************************************************
1080 int PcrSeqsCommand::readTax(set<string> names){
1082 string thisOutputDir = outputDir;
1083 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1084 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile);
1086 m->openOutputFile(outputFileName, out);
1089 m->openInputFile(taxfile, in);
1092 bool wroteSomething = false;
1093 int removedCount = 0;
1096 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1098 in >> name; //read from first column
1099 in >> tax; //read from second column
1101 //if this name is in the accnos file
1102 if (names.count(name) == 0) {
1103 wroteSomething = true;
1104 out << name << '\t' << tax << endl;
1105 }else { removedCount++; }
1112 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1113 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1115 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1119 catch(exception& e) {
1120 m->errorOut(e, "PcrSeqsCommand", "readTax");
1124 //***************************************************************************************************************
1125 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1128 m->openInputFile(countfile, in);
1129 set<string>::iterator it;
1131 string goodCountFile = outputDir + m->getRootName(m->getSimpleName(countfile)) + getOutputFileNameTag("count", countfile);
1132 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1133 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1135 string headers = m->getline(in); m->gobble(in);
1136 goodCountOut << headers << endl;
1138 string name, rest; int thisTotal, removedCount; removedCount = 0;
1139 bool wroteSomething = false;
1142 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1144 in >> name; m->gobble(in);
1145 in >> thisTotal; m->gobble(in);
1146 rest = m->getline(in); m->gobble(in);
1148 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1150 wroteSomething = true;
1151 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1155 goodCountOut.close();
1157 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1159 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1161 //check for groups that have been eliminated
1163 if (ct.testGroups(goodCountFile)) {
1164 ct.readTable(goodCountFile);
1165 ct.printTable(goodCountFile);
1168 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1170 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1176 catch(exception& e) {
1177 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1181 /**************************************************************************************/