5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "PcrSeqsCommand", "setParameters");
39 //**********************************************************************************************************************
40 string PcrSeqsCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The pcr.seqs command reads a fasta file.\n";
44 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
45 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
46 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
47 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
48 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
49 helpString += "The processors parameter allows you to use multiple processors.\n";
50 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
51 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
53 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
57 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
61 //**********************************************************************************************************************
62 string PcrSeqsCommand::getOutputPattern(string type) {
66 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
67 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
68 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
69 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
70 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
72 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
77 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
81 //**********************************************************************************************************************
83 PcrSeqsCommand::PcrSeqsCommand(){
85 abort = true; calledHelp = true;
87 vector<string> tempOutNames;
88 outputTypes["fasta"] = tempOutNames;
89 outputTypes["taxonomy"] = tempOutNames;
90 outputTypes["group"] = tempOutNames;
91 outputTypes["name"] = tempOutNames;
92 outputTypes["count"] = tempOutNames;
93 outputTypes["accnos"] = tempOutNames;
96 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
100 //***************************************************************************************************************
102 PcrSeqsCommand::PcrSeqsCommand(string option) {
105 abort = false; calledHelp = false;
107 //allow user to run help
108 if(option == "help") { help(); abort = true; calledHelp = true; }
109 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
112 vector<string> myArray = setParameters();
114 OptionParser parser(option);
115 map<string,string> parameters = parser.getParameters();
117 ValidParameters validParameter;
118 map<string,string>::iterator it;
120 //check to make sure all parameters are valid for command
121 for (it = parameters.begin(); it != parameters.end(); it++) {
122 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
125 //initialize outputTypes
126 vector<string> tempOutNames;
127 outputTypes["fasta"] = tempOutNames;
128 outputTypes["taxonomy"] = tempOutNames;
129 outputTypes["group"] = tempOutNames;
130 outputTypes["name"] = tempOutNames;
131 outputTypes["accnos"] = tempOutNames;
132 outputTypes["count"] = tempOutNames;
134 //if the user changes the input directory command factory will send this info to us in the output parameter
135 string inputDir = validParameter.validFile(parameters, "inputdir", false);
136 if (inputDir == "not found"){ inputDir = ""; }
139 it = parameters.find("fasta");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["fasta"] = inputDir + it->second; }
147 it = parameters.find("oligos");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["oligos"] = inputDir + it->second; }
155 it = parameters.find("ecoli");
156 //user has given a template file
157 if(it != parameters.end()){
158 path = m->hasPath(it->second);
159 //if the user has not given a path then, add inputdir. else leave path alone.
160 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
163 it = parameters.find("taxonomy");
164 //user has given a template file
165 if(it != parameters.end()){
166 path = m->hasPath(it->second);
167 //if the user has not given a path then, add inputdir. else leave path alone.
168 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
171 it = parameters.find("name");
172 //user has given a template file
173 if(it != parameters.end()){
174 path = m->hasPath(it->second);
175 //if the user has not given a path then, add inputdir. else leave path alone.
176 if (path == "") { parameters["name"] = inputDir + it->second; }
179 it = parameters.find("group");
180 //user has given a template file
181 if(it != parameters.end()){
182 path = m->hasPath(it->second);
183 //if the user has not given a path then, add inputdir. else leave path alone.
184 if (path == "") { parameters["group"] = inputDir + it->second; }
187 it = parameters.find("count");
188 //user has given a template file
189 if(it != parameters.end()){
190 path = m->hasPath(it->second);
191 //if the user has not given a path then, add inputdir. else leave path alone.
192 if (path == "") { parameters["count"] = inputDir + it->second; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") {
201 fastafile = m->getFastaFile();
202 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
205 else { m->setFastaFile(fastafile); }
207 //if the user changes the output directory command factory will send this info to us in the output parameter
208 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
210 //check for optional parameter and set defaults
211 // ...at some point should added some additional type checking...
213 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
214 keepprimer = m->isTrue(temp);
216 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
217 keepdots = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "oligos", true);
220 if (temp == "not found"){ oligosfile = ""; }
221 else if(temp == "not open"){ oligosfile = ""; abort = true; }
222 else { oligosfile = temp; m->setOligosFile(oligosfile); }
224 ecolifile = validParameter.validFile(parameters, "ecoli", true);
225 if (ecolifile == "not found"){ ecolifile = ""; }
226 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
228 namefile = validParameter.validFile(parameters, "name", true);
229 if (namefile == "not found"){ namefile = ""; }
230 else if(namefile == "not open"){ namefile = ""; abort = true; }
231 else { m->setNameFile(namefile); }
233 groupfile = validParameter.validFile(parameters, "group", true);
234 if (groupfile == "not found"){ groupfile = ""; }
235 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
236 else { m->setGroupFile(groupfile); }
238 countfile = validParameter.validFile(parameters, "count", true);
239 if (countfile == "not open") { countfile = ""; abort = true; }
240 else if (countfile == "not found") { countfile = ""; }
241 else { m->setCountTableFile(countfile); }
243 if ((namefile != "") && (countfile != "")) {
244 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
247 if ((groupfile != "") && (countfile != "")) {
248 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
251 taxfile = validParameter.validFile(parameters, "taxonomy", true);
252 if (taxfile == "not found"){ taxfile = ""; }
253 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
254 else { m->setTaxonomyFile(taxfile); }
256 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
257 m->mothurConvert(temp, start);
259 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, end);
262 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
263 m->setProcessors(temp);
264 m->mothurConvert(temp, processors);
266 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
268 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
271 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
272 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
275 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
277 if ((ecolifile != "") && (start != -1) && (end != -1)) {
278 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
282 if ((oligosfile != "") && (ecolifile != "")) {
283 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
286 //check to make sure you didn't forget the name file by mistake
287 if (countfile == "") {
288 if (namefile == "") {
289 vector<string> files; files.push_back(fastafile);
290 parser.getNameFile(files);
296 catch(exception& e) {
297 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
301 //***************************************************************************************************************
303 int PcrSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 int start = time(NULL);
310 string thisOutputDir = outputDir;
311 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
312 map<string, string> variables;
313 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
314 variables["[extension]"] = m->getExtension(fastafile);
315 string trimSeqFile = getOutputFileName("fasta",variables);
316 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 variables["[tag]"] = "scrap";
318 string badSeqFile = getOutputFileName("fasta",variables);
322 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
323 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
325 vector<unsigned long long> positions;
326 int numFastaSeqs = 0;
327 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
328 positions = m->divideFile(fastafile, processors);
329 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
331 if (processors == 1) {
332 lines.push_back(linePair(0, 1000));
334 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
335 if (positions.size() < processors) { processors = positions.size(); }
337 //figure out how many sequences you have to process
338 int numSeqsPerProcessor = numFastaSeqs / processors;
339 for (int i = 0; i < processors; i++) {
340 int startIndex = i * numSeqsPerProcessor;
341 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
342 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
346 if (m->control_pressed) { return 0; }
348 set<string> badNames;
349 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
350 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
352 if (m->control_pressed) { return 0; }
354 //don't write or keep if blank
355 if (badNames.size() != 0) { writeAccnos(badNames); }
356 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
357 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
359 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
360 if (namefile != "") { readName(badNames); }
361 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
362 if (groupfile != "") { readGroup(badNames); }
363 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
364 if (taxfile != "") { readTax(badNames); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (countfile != "") { readCount(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
369 m->mothurOutEndLine();
370 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
371 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
372 m->mothurOutEndLine();
373 m->mothurOutEndLine();
375 //set fasta file as new current fastafile
377 itTypes = outputTypes.find("fasta");
378 if (itTypes != outputTypes.end()) {
379 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
382 itTypes = outputTypes.find("name");
383 if (itTypes != outputTypes.end()) {
384 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
387 itTypes = outputTypes.find("group");
388 if (itTypes != outputTypes.end()) {
389 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
392 itTypes = outputTypes.find("accnos");
393 if (itTypes != outputTypes.end()) {
394 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
397 itTypes = outputTypes.find("taxonomy");
398 if (itTypes != outputTypes.end()) {
399 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
402 itTypes = outputTypes.find("count");
403 if (itTypes != outputTypes.end()) {
404 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
407 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
408 m->mothurOutEndLine();
414 catch(exception& e) {
415 m->errorOut(e, "PcrSeqsCommand", "execute");
419 /**************************************************************************************************/
420 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
423 vector<int> processIDS;
427 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
429 //loop through and create all the processes you want
430 while (process != processors) {
434 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
437 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
439 //pass numSeqs to parent
441 string tempFile = filename + toString(getpid()) + ".num.temp";
442 m->openOutputFile(tempFile, out);
443 out << num << '\t' << badSeqNames.size() << endl;
444 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
445 out << (*it) << endl;
451 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
452 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
457 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
459 //force parent to wait until all the processes are done
460 for (int i=0;i<processIDS.size();i++) {
461 int temp = processIDS[i];
465 for (int i = 0; i < processIDS.size(); i++) {
467 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
468 m->openInputFile(tempFile, in);
469 int numBadNames = 0; string name = "";
470 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
471 for (int j = 0; j < numBadNames; j++) {
472 in >> name; m->gobble(in);
473 badSeqNames.insert(name);
475 in.close(); m->mothurRemove(tempFile);
477 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
478 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
480 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
481 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
485 //////////////////////////////////////////////////////////////////////////////////////////////////////
486 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
487 //Above fork() will clone, so memory is separate, but that's not the case with windows,
488 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
489 //////////////////////////////////////////////////////////////////////////////////////////////////////
491 vector<pcrData*> pDataArray;
492 DWORD dwThreadIdArray[processors-1];
493 HANDLE hThreadArray[processors-1];
495 //Create processor worker threads.
496 for( int i=0; i<processors-1; i++ ){
498 string extension = "";
499 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
501 // Allocate memory for thread data.
502 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
503 pDataArray.push_back(tempPcr);
505 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
506 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
510 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
511 processIDS.push_back(processors-1);
513 //Wait until all threads have terminated.
514 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
516 //Close all thread handles and free memory allocations.
517 for(int i=0; i < pDataArray.size(); i++){
518 num += pDataArray[i]->count;
519 if (pDataArray[i]->count != pDataArray[i]->fend) {
520 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
522 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
523 CloseHandle(hThreadArray[i]);
524 delete pDataArray[i];
527 for (int i = 0; i < processIDS.size(); i++) {
528 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
529 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
531 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
532 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
540 catch(exception& e) {
541 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
546 //**********************************************************************************************************************
547 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
550 m->openOutputFile(goodFasta, goodFile);
553 m->openOutputFile(badFasta, badFile);
556 m->openInputFile(filename, inFASTA);
558 inFASTA.seekg(filePos.start);
566 if (m->control_pressed) { break; }
568 Sequence currSeq(inFASTA); m->gobble(inFASTA);
570 string trashCode = "";
571 if (currSeq.getName() != "") {
574 if (oligosfile != "") {
575 map<int, int> mapAligned;
576 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
579 if (primers.size() != 0) {
580 int primerStart = 0; int primerEnd = 0;
581 bool good = findForward(currSeq, primerStart, primerEnd);
583 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
588 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
589 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
592 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
593 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
596 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
597 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
602 //process reverse primers
603 if (revPrimer.size() != 0) {
604 int primerStart = 0; int primerEnd = 0;
605 bool good = findReverse(currSeq, primerStart, primerEnd);
606 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
611 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
612 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
615 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
616 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
620 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
621 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
625 }else if (ecolifile != "") {
626 //make sure the seqs are aligned
627 lengths.insert(currSeq.getAligned().length());
628 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
629 else if (currSeq.getAligned().length() != length) {
630 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
633 currSeq.filterToPos(start);
634 currSeq.filterFromPos(end);
636 string seqString = currSeq.getAligned().substr(0, end);
637 seqString = seqString.substr(start);
638 currSeq.setAligned(seqString);
641 }else{ //using start and end to trim
642 //make sure the seqs are aligned
643 lengths.insert(currSeq.getAligned().length());
644 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
647 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
649 if (keepdots) { currSeq.filterFromPos(end); }
651 string seqString = currSeq.getAligned().substr(0, end);
652 currSeq.setAligned(seqString);
657 if (keepdots) { currSeq.filterToPos(start); }
659 string seqString = currSeq.getAligned().substr(start);
660 currSeq.setAligned(seqString);
666 //trimming removed all bases
667 if (currSeq.getUnaligned() == "") { goodSeq = false; }
669 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
671 badSeqNames.insert(currSeq.getName());
672 currSeq.setName(currSeq.getName() + '|' + trashCode);
673 currSeq.printSequence(badFile);
678 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
679 unsigned long long pos = inFASTA.tellg();
680 if ((pos == -1) || (pos >= filePos.end)) { break; }
682 if (inFASTA.eof()) { break; }
686 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
689 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
697 catch(exception& e) {
698 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
702 //********************************************************************/
703 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
706 string rawSequence = seq.getUnaligned();
708 for(int j=0;j<primers.size();j++){
709 string oligo = primers[j];
711 if(rawSequence.length() < oligo.length()) { break; }
714 int olength = oligo.length();
715 for (int j = 0; j < rawSequence.length()-olength; j++){
716 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
717 string rawChunk = rawSequence.substr(j, olength);
718 if(compareDNASeq(oligo, rawChunk)) {
720 primerEnd = primerStart + olength;
727 primerStart = 0; primerEnd = 0;
731 catch(exception& e) {
732 m->errorOut(e, "TrimOligos", "stripForward");
736 //******************************************************************/
737 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
740 string rawSequence = seq.getUnaligned();
742 for(int i=0;i<revPrimer.size();i++){
743 string oligo = revPrimer[i];
744 if(rawSequence.length() < oligo.length()) { break; }
747 int olength = oligo.length();
748 for (int j = rawSequence.length()-olength; j >= 0; j--){
749 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
750 string rawChunk = rawSequence.substr(j, olength);
752 if(compareDNASeq(oligo, rawChunk)) {
754 primerEnd = primerStart + olength;
761 primerStart = 0; primerEnd = 0;
764 catch(exception& e) {
765 m->errorOut(e, "PcrSeqsCommand", "findReverse");
769 //********************************************************************/
770 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
772 bool isAligned = false;
775 for (int i = 0; i < seq.length(); i++) {
776 if (!isalpha(seq[i])) { isAligned = true; }
777 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
778 } //ie. the 3rd base may be at spot 10 in the alignment
779 //later when we trim we want to trim from spot 10.
782 catch(exception& e) {
783 m->errorOut(e, "PcrSeqsCommand", "isAligned");
787 //********************************************************************/
788 string PcrSeqsCommand::reverseOligo(string oligo){
792 for(int i=oligo.length()-1;i>=0;i--){
794 if(oligo[i] == 'A') { reverse += 'T'; }
795 else if(oligo[i] == 'T'){ reverse += 'A'; }
796 else if(oligo[i] == 'U'){ reverse += 'A'; }
798 else if(oligo[i] == 'G'){ reverse += 'C'; }
799 else if(oligo[i] == 'C'){ reverse += 'G'; }
801 else if(oligo[i] == 'R'){ reverse += 'Y'; }
802 else if(oligo[i] == 'Y'){ reverse += 'R'; }
804 else if(oligo[i] == 'M'){ reverse += 'K'; }
805 else if(oligo[i] == 'K'){ reverse += 'M'; }
807 else if(oligo[i] == 'W'){ reverse += 'W'; }
808 else if(oligo[i] == 'S'){ reverse += 'S'; }
810 else if(oligo[i] == 'B'){ reverse += 'V'; }
811 else if(oligo[i] == 'V'){ reverse += 'B'; }
813 else if(oligo[i] == 'D'){ reverse += 'H'; }
814 else if(oligo[i] == 'H'){ reverse += 'D'; }
816 else { reverse += 'N'; }
822 catch(exception& e) {
823 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
828 //***************************************************************************************************************
829 bool PcrSeqsCommand::readOligos(){
832 m->openInputFile(oligosfile, inOligos);
834 string type, oligo, group;
836 while(!inOligos.eof()){
840 if(type[0] == '#'){ //ignore
841 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
845 //make type case insensitive
846 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
850 for(int i=0;i<oligo.length();i++){
851 oligo[i] = toupper(oligo[i]);
852 if(oligo[i] == 'U') { oligo[i] = 'T'; }
855 if(type == "FORWARD"){
856 // get rest of line in case there is a primer name
857 while (!inOligos.eof()) {
858 char c = inOligos.get();
859 if (c == 10 || c == 13){ break; }
860 else if (c == 32 || c == 9){;} //space or tab
862 primers.push_back(oligo);
863 }else if(type == "REVERSE"){
864 string oligoRC = reverseOligo(oligo);
865 revPrimer.push_back(oligoRC);
866 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
867 }else if(type == "BARCODE"){
869 }else if((type == "LINKER")||(type == "SPACER")) {;}
870 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
876 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
877 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
878 m->control_pressed = true;
884 }catch(exception& e) {
885 m->errorOut(e, "PcrSeqsCommand", "readOligos");
889 //***************************************************************************************************************
890 bool PcrSeqsCommand::readEcoli(){
893 m->openInputFile(ecolifile, in);
898 length = ecoli.getAligned().length();
899 start = ecoli.getStartPos();
900 end = ecoli.getEndPos();
901 }else { in.close(); m->control_pressed = true; return false; }
906 catch(exception& e) {
907 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
912 //***************************************************************************************************************
913 int PcrSeqsCommand::writeAccnos(set<string> badNames){
915 string thisOutputDir = outputDir;
916 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
917 map<string, string> variables;
918 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
919 string outputFileName = getOutputFileName("accnos",variables);
920 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
923 m->openOutputFile(outputFileName, out);
925 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
926 if (m->control_pressed) { break; }
927 out << (*it) << endl;
933 catch(exception& e) {
934 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
939 //******************************************************************/
940 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
943 int length = oligo.length();
945 for(int i=0;i<length;i++){
947 if(oligo[i] != seq[i]){
948 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
949 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
950 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
951 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
952 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
953 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
954 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
955 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
956 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
957 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
958 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
959 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
961 if(success == 0) { break; }
970 catch(exception& e) {
971 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
976 //***************************************************************************************************************
977 int PcrSeqsCommand::readName(set<string>& names){
979 string thisOutputDir = outputDir;
980 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
981 map<string, string> variables;
982 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
983 variables["[extension]"] = m->getExtension(namefile);
984 string outputFileName = getOutputFileName("name", variables);
987 m->openOutputFile(outputFileName, out);
990 m->openInputFile(namefile, in);
991 string name, firstCol, secondCol;
993 bool wroteSomething = false;
994 int removedCount = 0;
997 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
999 in >> firstCol; m->gobble(in);
1002 string savedSecond = secondCol;
1003 vector<string> parsedNames;
1004 m->splitAtComma(secondCol, parsedNames);
1006 vector<string> validSecond; validSecond.clear();
1007 for (int i = 0; i < parsedNames.size(); i++) {
1008 if (names.count(parsedNames[i]) == 0) {
1009 validSecond.push_back(parsedNames[i]);
1013 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1014 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1015 removedCount += parsedNames.size();
1017 out << firstCol << '\t' << savedSecond << endl;
1018 wroteSomething = true;
1025 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1026 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1028 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1032 catch(exception& e) {
1033 m->errorOut(e, "PcrSeqsCommand", "readName");
1037 //**********************************************************************************************************************
1038 int PcrSeqsCommand::readGroup(set<string> names){
1040 string thisOutputDir = outputDir;
1041 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1042 map<string, string> variables;
1043 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1044 variables["[extension]"] = m->getExtension(groupfile);
1045 string outputFileName = getOutputFileName("group", variables);
1048 m->openOutputFile(outputFileName, out);
1051 m->openInputFile(groupfile, in);
1054 bool wroteSomething = false;
1055 int removedCount = 0;
1058 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1060 in >> name; //read from first column
1061 in >> group; //read from second column
1063 //if this name is in the accnos file
1064 if (names.count(name) == 0) {
1065 wroteSomething = true;
1066 out << name << '\t' << group << endl;
1067 }else { removedCount++; }
1074 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1075 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1077 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1082 catch(exception& e) {
1083 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1087 //**********************************************************************************************************************
1088 int PcrSeqsCommand::readTax(set<string> names){
1090 string thisOutputDir = outputDir;
1091 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1092 map<string, string> variables;
1093 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1094 variables["[extension]"] = m->getExtension(taxfile);
1095 string outputFileName = getOutputFileName("taxonomy", variables);
1098 m->openOutputFile(outputFileName, out);
1101 m->openInputFile(taxfile, in);
1104 bool wroteSomething = false;
1105 int removedCount = 0;
1108 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1110 in >> name; //read from first column
1111 in >> tax; //read from second column
1113 //if this name is in the accnos file
1114 if (names.count(name) == 0) {
1115 wroteSomething = true;
1116 out << name << '\t' << tax << endl;
1117 }else { removedCount++; }
1124 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1125 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1127 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1131 catch(exception& e) {
1132 m->errorOut(e, "PcrSeqsCommand", "readTax");
1136 //***************************************************************************************************************
1137 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1140 m->openInputFile(countfile, in);
1141 set<string>::iterator it;
1143 map<string, string> variables;
1144 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1145 variables["[extension]"] = m->getExtension(countfile);
1146 string goodCountFile = getOutputFileName("count", variables);
1148 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1149 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1151 string headers = m->getline(in); m->gobble(in);
1152 goodCountOut << headers << endl;
1154 string name, rest; int thisTotal, removedCount; removedCount = 0;
1155 bool wroteSomething = false;
1158 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1160 in >> name; m->gobble(in);
1161 in >> thisTotal; m->gobble(in);
1162 rest = m->getline(in); m->gobble(in);
1164 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1166 wroteSomething = true;
1167 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1171 goodCountOut.close();
1173 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1175 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1177 //check for groups that have been eliminated
1179 if (ct.testGroups(goodCountFile)) {
1180 ct.readTable(goodCountFile);
1181 ct.printTable(goodCountFile);
1184 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1186 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1192 catch(exception& e) {
1193 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1197 /**************************************************************************************/