5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
28 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "PcrSeqsCommand", "setParameters");
41 //**********************************************************************************************************************
42 string PcrSeqsCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The pcr.seqs command reads a fasta file.\n";
46 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
47 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
48 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
49 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
50 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
51 helpString += "The processors parameter allows you to use multiple processors.\n";
52 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
53 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
54 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
55 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
56 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
60 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string PcrSeqsCommand::getOutputPattern(string type) {
69 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
70 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
72 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
73 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
74 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
80 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
84 //**********************************************************************************************************************
86 PcrSeqsCommand::PcrSeqsCommand(){
88 abort = true; calledHelp = true;
90 vector<string> tempOutNames;
91 outputTypes["fasta"] = tempOutNames;
92 outputTypes["taxonomy"] = tempOutNames;
93 outputTypes["group"] = tempOutNames;
94 outputTypes["name"] = tempOutNames;
95 outputTypes["count"] = tempOutNames;
96 outputTypes["accnos"] = tempOutNames;
99 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
103 //***************************************************************************************************************
105 PcrSeqsCommand::PcrSeqsCommand(string option) {
108 abort = false; calledHelp = false;
110 //allow user to run help
111 if(option == "help") { help(); abort = true; calledHelp = true; }
112 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string,string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string,string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["fasta"] = tempOutNames;
131 outputTypes["taxonomy"] = tempOutNames;
132 outputTypes["group"] = tempOutNames;
133 outputTypes["name"] = tempOutNames;
134 outputTypes["accnos"] = tempOutNames;
135 outputTypes["count"] = tempOutNames;
137 //if the user changes the input directory command factory will send this info to us in the output parameter
138 string inputDir = validParameter.validFile(parameters, "inputdir", false);
139 if (inputDir == "not found"){ inputDir = ""; }
142 it = parameters.find("fasta");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["fasta"] = inputDir + it->second; }
150 it = parameters.find("oligos");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["oligos"] = inputDir + it->second; }
158 it = parameters.find("ecoli");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
166 it = parameters.find("taxonomy");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
182 it = parameters.find("group");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["group"] = inputDir + it->second; }
190 it = parameters.find("count");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["count"] = inputDir + it->second; }
201 //check for required parameters
202 fastafile = validParameter.validFile(parameters, "fasta", true);
203 if (fastafile == "not found") {
204 fastafile = m->getFastaFile();
205 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
206 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
207 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
208 else { m->setFastaFile(fastafile); }
210 //if the user changes the output directory command factory will send this info to us in the output parameter
211 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
213 //check for optional parameter and set defaults
214 // ...at some point should added some additional type checking...
216 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
217 keepprimer = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
220 keepdots = m->isTrue(temp);
222 temp = validParameter.validFile(parameters, "oligos", true);
223 if (temp == "not found"){ oligosfile = ""; }
224 else if(temp == "not open"){ oligosfile = ""; abort = true; }
225 else { oligosfile = temp; m->setOligosFile(oligosfile); }
227 ecolifile = validParameter.validFile(parameters, "ecoli", true);
228 if (ecolifile == "not found"){ ecolifile = ""; }
229 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
231 namefile = validParameter.validFile(parameters, "name", true);
232 if (namefile == "not found"){ namefile = ""; }
233 else if(namefile == "not open"){ namefile = ""; abort = true; }
234 else { m->setNameFile(namefile); }
236 groupfile = validParameter.validFile(parameters, "group", true);
237 if (groupfile == "not found"){ groupfile = ""; }
238 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
239 else { m->setGroupFile(groupfile); }
241 countfile = validParameter.validFile(parameters, "count", true);
242 if (countfile == "not open") { countfile = ""; abort = true; }
243 else if (countfile == "not found") { countfile = ""; }
244 else { m->setCountTableFile(countfile); }
246 if ((namefile != "") && (countfile != "")) {
247 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
250 if ((groupfile != "") && (countfile != "")) {
251 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
254 taxfile = validParameter.validFile(parameters, "taxonomy", true);
255 if (taxfile == "not found"){ taxfile = ""; }
256 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
257 else { m->setTaxonomyFile(taxfile); }
259 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, start);
262 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
263 m->mothurConvert(temp, end);
265 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
266 m->setProcessors(temp);
267 m->mothurConvert(temp, processors);
269 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, pdiffs);
272 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
274 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
277 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
278 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
281 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
283 if ((ecolifile != "") && (start != -1) && (end != -1)) {
284 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
288 if ((oligosfile != "") && (ecolifile != "")) {
289 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
292 //check to make sure you didn't forget the name file by mistake
293 if (countfile == "") {
294 if (namefile == "") {
295 vector<string> files; files.push_back(fastafile);
296 parser.getNameFile(files);
302 catch(exception& e) {
303 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
307 //***************************************************************************************************************
309 int PcrSeqsCommand::execute(){
312 if (abort == true) { if (calledHelp) { return 0; } return 2; }
314 int start = time(NULL);
316 string thisOutputDir = outputDir;
317 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
318 map<string, string> variables;
319 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
320 variables["[extension]"] = m->getExtension(fastafile);
321 string trimSeqFile = getOutputFileName("fasta",variables);
322 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
323 variables["[tag]"] = "scrap";
324 string badSeqFile = getOutputFileName("fasta",variables);
328 if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; }
329 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
331 vector<unsigned long long> positions;
332 int numFastaSeqs = 0;
333 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
334 positions = m->divideFile(fastafile, processors);
335 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
337 if (processors == 1) {
338 lines.push_back(linePair(0, 1000));
340 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
341 if (positions.size() < processors) { processors = positions.size(); }
343 //figure out how many sequences you have to process
344 int numSeqsPerProcessor = numFastaSeqs / processors;
345 for (int i = 0; i < processors; i++) {
346 int startIndex = i * numSeqsPerProcessor;
347 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
348 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
352 if (m->control_pressed) { return 0; }
354 set<string> badNames;
355 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
356 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
358 if (m->control_pressed) { return 0; }
360 //don't write or keep if blank
361 if (badNames.size() != 0) { writeAccnos(badNames); }
362 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
363 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (namefile != "") { readName(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
368 if (groupfile != "") { readGroup(badNames); }
369 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
370 if (taxfile != "") { readTax(badNames); }
371 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 if (countfile != "") { readCount(badNames); }
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 m->mothurOutEndLine();
376 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
377 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
378 m->mothurOutEndLine();
379 m->mothurOutEndLine();
381 //set fasta file as new current fastafile
383 itTypes = outputTypes.find("fasta");
384 if (itTypes != outputTypes.end()) {
385 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
388 itTypes = outputTypes.find("name");
389 if (itTypes != outputTypes.end()) {
390 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
393 itTypes = outputTypes.find("group");
394 if (itTypes != outputTypes.end()) {
395 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
398 itTypes = outputTypes.find("accnos");
399 if (itTypes != outputTypes.end()) {
400 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
403 itTypes = outputTypes.find("taxonomy");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
408 itTypes = outputTypes.find("count");
409 if (itTypes != outputTypes.end()) {
410 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
413 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
414 m->mothurOutEndLine();
420 catch(exception& e) {
421 m->errorOut(e, "PcrSeqsCommand", "execute");
425 /**************************************************************************************************/
426 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
429 vector<int> processIDS;
433 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
435 //loop through and create all the processes you want
436 while (process != processors) {
440 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
443 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
445 //pass numSeqs to parent
447 string tempFile = filename + toString(getpid()) + ".num.temp";
448 m->openOutputFile(tempFile, out);
449 out << num << '\t' << badSeqNames.size() << endl;
450 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
451 out << (*it) << endl;
457 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
458 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
463 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
465 //force parent to wait until all the processes are done
466 for (int i=0;i<processIDS.size();i++) {
467 int temp = processIDS[i];
471 for (int i = 0; i < processIDS.size(); i++) {
473 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
474 m->openInputFile(tempFile, in);
475 int numBadNames = 0; string name = "";
476 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
477 for (int j = 0; j < numBadNames; j++) {
478 in >> name; m->gobble(in);
479 badSeqNames.insert(name);
481 in.close(); m->mothurRemove(tempFile);
483 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
484 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
486 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
487 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
491 //////////////////////////////////////////////////////////////////////////////////////////////////////
492 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
493 //Above fork() will clone, so memory is separate, but that's not the case with windows,
494 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
495 //////////////////////////////////////////////////////////////////////////////////////////////////////
497 vector<pcrData*> pDataArray;
498 DWORD dwThreadIdArray[processors-1];
499 HANDLE hThreadArray[processors-1];
501 //Create processor worker threads.
502 for( int i=0; i<processors-1; i++ ){
504 string extension = "";
505 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
507 // Allocate memory for thread data.
508 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end);
509 pDataArray.push_back(tempPcr);
511 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
512 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
516 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
517 processIDS.push_back(processors-1);
519 //Wait until all threads have terminated.
520 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
522 //Close all thread handles and free memory allocations.
523 for(int i=0; i < pDataArray.size(); i++){
524 num += pDataArray[i]->count;
525 if (pDataArray[i]->count != pDataArray[i]->fend) {
526 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
528 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
529 CloseHandle(hThreadArray[i]);
530 delete pDataArray[i];
533 for (int i = 0; i < processIDS.size(); i++) {
534 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
535 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
537 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
538 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
546 catch(exception& e) {
547 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
552 //**********************************************************************************************************************
553 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
556 m->openOutputFile(goodFasta, goodFile);
559 m->openOutputFile(badFasta, badFile);
562 m->openInputFile(filename, inFASTA);
564 inFASTA.seekg(filePos.start);
570 //pdiffs, bdiffs, primers, barcodes, revPrimers
571 map<string, int> faked;
572 TrimOligos trim(pdiffs, 0, primers, faked, revPrimer);
576 if (m->control_pressed) { break; }
578 Sequence currSeq(inFASTA); m->gobble(inFASTA);
580 string trashCode = "";
581 if (currSeq.getName() != "") {
583 if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); }
586 if (oligosfile != "") {
587 map<int, int> mapAligned;
588 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
591 if (primers.size() != 0) {
592 int primerStart = 0; int primerEnd = 0;
593 bool good = trim.findForward(currSeq, primerStart, primerEnd);
595 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
600 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
601 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
604 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
605 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
607 isAligned(currSeq.getAligned(), mapAligned);
609 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
610 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
615 //process reverse primers
616 if (revPrimer.size() != 0) {
617 int primerStart = 0; int primerEnd = 0;
618 bool good = trim.findReverse(currSeq, primerStart, primerEnd);
619 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
624 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
625 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
628 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
629 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
633 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
634 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
638 }else if (ecolifile != "") {
639 //make sure the seqs are aligned
640 lengths.insert(currSeq.getAligned().length());
641 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
642 else if (currSeq.getAligned().length() != length) {
643 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
646 currSeq.filterToPos(start);
647 currSeq.filterFromPos(end);
649 string seqString = currSeq.getAligned().substr(0, end);
650 seqString = seqString.substr(start);
651 currSeq.setAligned(seqString);
654 }else{ //using start and end to trim
655 //make sure the seqs are aligned
656 lengths.insert(currSeq.getAligned().length());
657 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
660 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
662 if (keepdots) { currSeq.filterFromPos(end); }
664 string seqString = currSeq.getAligned().substr(0, end);
665 currSeq.setAligned(seqString);
670 if (keepdots) { currSeq.filterToPos(start); }
672 string seqString = currSeq.getAligned().substr(start);
673 currSeq.setAligned(seqString);
679 //trimming removed all bases
680 if (currSeq.getUnaligned() == "") { goodSeq = false; }
682 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
684 badSeqNames.insert(currSeq.getName());
685 currSeq.setName(currSeq.getName() + '|' + trashCode);
686 currSeq.printSequence(badFile);
691 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
692 unsigned long long pos = inFASTA.tellg();
693 if ((pos == -1) || (pos >= filePos.end)) { break; }
695 if (inFASTA.eof()) { break; }
699 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
702 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
710 catch(exception& e) {
711 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
715 //********************************************************************/
716 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
719 bool isAligned = false;
722 for (int i = 0; i < seq.length(); i++) {
723 if (!isalpha(seq[i])) { isAligned = true; }
724 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
725 } //ie. the 3rd base may be at spot 10 in the alignment
726 //later when we trim we want to trim from spot 10.
729 catch(exception& e) {
730 m->errorOut(e, "PcrSeqsCommand", "isAligned");
734 //********************************************************************/
735 string PcrSeqsCommand::reverseOligo(string oligo){
739 for(int i=oligo.length()-1;i>=0;i--){
741 if(oligo[i] == 'A') { reverse += 'T'; }
742 else if(oligo[i] == 'T'){ reverse += 'A'; }
743 else if(oligo[i] == 'U'){ reverse += 'A'; }
745 else if(oligo[i] == 'G'){ reverse += 'C'; }
746 else if(oligo[i] == 'C'){ reverse += 'G'; }
748 else if(oligo[i] == 'R'){ reverse += 'Y'; }
749 else if(oligo[i] == 'Y'){ reverse += 'R'; }
751 else if(oligo[i] == 'M'){ reverse += 'K'; }
752 else if(oligo[i] == 'K'){ reverse += 'M'; }
754 else if(oligo[i] == 'W'){ reverse += 'W'; }
755 else if(oligo[i] == 'S'){ reverse += 'S'; }
757 else if(oligo[i] == 'B'){ reverse += 'V'; }
758 else if(oligo[i] == 'V'){ reverse += 'B'; }
760 else if(oligo[i] == 'D'){ reverse += 'H'; }
761 else if(oligo[i] == 'H'){ reverse += 'D'; }
763 else { reverse += 'N'; }
769 catch(exception& e) {
770 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
775 //***************************************************************************************************************
776 bool PcrSeqsCommand::readOligos(){
779 m->openInputFile(oligosfile, inOligos);
781 string type, oligo, group;
784 while(!inOligos.eof()){
788 if(type[0] == '#'){ //ignore
789 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
793 //make type case insensitive
794 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
798 for(int i=0;i<oligo.length();i++){
799 oligo[i] = toupper(oligo[i]);
800 if(oligo[i] == 'U') { oligo[i] = 'T'; }
803 if(type == "FORWARD"){
804 // get rest of line in case there is a primer name
805 while (!inOligos.eof()) {
806 char c = inOligos.get();
807 if (c == 10 || c == 13 || c == -1){ break; }
808 else if (c == 32 || c == 9){;} //space or tab
810 primers[oligo] = primerCount; primerCount++;
811 }else if(type == "REVERSE"){
812 string oligoRC = reverseOligo(oligo);
813 revPrimer.push_back(oligoRC);
814 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
815 }else if(type == "BARCODE"){
817 }else if((type == "LINKER")||(type == "SPACER")) {;}
818 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
824 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
825 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
826 m->control_pressed = true;
832 }catch(exception& e) {
833 m->errorOut(e, "PcrSeqsCommand", "readOligos");
837 //***************************************************************************************************************
838 bool PcrSeqsCommand::readEcoli(){
841 m->openInputFile(ecolifile, in);
846 length = ecoli.getAligned().length();
847 start = ecoli.getStartPos();
848 end = ecoli.getEndPos();
849 }else { in.close(); m->control_pressed = true; return false; }
854 catch(exception& e) {
855 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
860 //***************************************************************************************************************
861 int PcrSeqsCommand::writeAccnos(set<string> badNames){
863 string thisOutputDir = outputDir;
864 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
865 map<string, string> variables;
866 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
867 string outputFileName = getOutputFileName("accnos",variables);
868 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
871 m->openOutputFile(outputFileName, out);
873 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
874 if (m->control_pressed) { break; }
875 out << (*it) << endl;
881 catch(exception& e) {
882 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
887 //***************************************************************************************************************
888 int PcrSeqsCommand::readName(set<string>& names){
890 string thisOutputDir = outputDir;
891 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
892 map<string, string> variables;
893 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
894 variables["[extension]"] = m->getExtension(namefile);
895 string outputFileName = getOutputFileName("name", variables);
898 m->openOutputFile(outputFileName, out);
901 m->openInputFile(namefile, in);
902 string name, firstCol, secondCol;
904 bool wroteSomething = false;
905 int removedCount = 0;
908 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
910 in >> firstCol; m->gobble(in);
913 string savedSecond = secondCol;
914 vector<string> parsedNames;
915 m->splitAtComma(secondCol, parsedNames);
917 vector<string> validSecond; validSecond.clear();
918 for (int i = 0; i < parsedNames.size(); i++) {
919 if (names.count(parsedNames[i]) == 0) {
920 validSecond.push_back(parsedNames[i]);
924 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
925 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
926 removedCount += parsedNames.size();
928 out << firstCol << '\t' << savedSecond << endl;
929 wroteSomething = true;
936 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
937 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
939 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
943 catch(exception& e) {
944 m->errorOut(e, "PcrSeqsCommand", "readName");
948 //**********************************************************************************************************************
949 int PcrSeqsCommand::readGroup(set<string> names){
951 string thisOutputDir = outputDir;
952 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
953 map<string, string> variables;
954 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
955 variables["[extension]"] = m->getExtension(groupfile);
956 string outputFileName = getOutputFileName("group", variables);
959 m->openOutputFile(outputFileName, out);
962 m->openInputFile(groupfile, in);
965 bool wroteSomething = false;
966 int removedCount = 0;
969 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
971 in >> name; //read from first column
972 in >> group; //read from second column
974 //if this name is in the accnos file
975 if (names.count(name) == 0) {
976 wroteSomething = true;
977 out << name << '\t' << group << endl;
978 }else { removedCount++; }
985 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
986 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
988 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
993 catch(exception& e) {
994 m->errorOut(e, "PcrSeqsCommand", "readGroup");
998 //**********************************************************************************************************************
999 int PcrSeqsCommand::readTax(set<string> names){
1001 string thisOutputDir = outputDir;
1002 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1003 map<string, string> variables;
1004 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1005 variables["[extension]"] = m->getExtension(taxfile);
1006 string outputFileName = getOutputFileName("taxonomy", variables);
1009 m->openOutputFile(outputFileName, out);
1012 m->openInputFile(taxfile, in);
1015 bool wroteSomething = false;
1016 int removedCount = 0;
1019 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1021 in >> name; //read from first column
1022 in >> tax; //read from second column
1024 //if this name is in the accnos file
1025 if (names.count(name) == 0) {
1026 wroteSomething = true;
1027 out << name << '\t' << tax << endl;
1028 }else { removedCount++; }
1035 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1036 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1038 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1042 catch(exception& e) {
1043 m->errorOut(e, "PcrSeqsCommand", "readTax");
1047 //***************************************************************************************************************
1048 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1051 m->openInputFile(countfile, in);
1052 set<string>::iterator it;
1054 map<string, string> variables;
1055 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1056 variables["[extension]"] = m->getExtension(countfile);
1057 string goodCountFile = getOutputFileName("count", variables);
1059 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1060 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1062 string headers = m->getline(in); m->gobble(in);
1063 goodCountOut << headers << endl;
1065 string name, rest; int thisTotal, removedCount; removedCount = 0;
1066 bool wroteSomething = false;
1069 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1071 in >> name; m->gobble(in);
1072 in >> thisTotal; m->gobble(in);
1073 rest = m->getline(in); m->gobble(in);
1075 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1077 wroteSomething = true;
1078 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1082 goodCountOut.close();
1084 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1086 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1088 //check for groups that have been eliminated
1090 if (ct.testGroups(goodCountFile)) {
1091 ct.readTable(goodCountFile);
1092 ct.printTable(goodCountFile);
1095 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1097 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1103 catch(exception& e) {
1104 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1108 /**************************************************************************************/