5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "PcrSeqsCommand", "setParameters");
38 //**********************************************************************************************************************
39 string PcrSeqsCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The pcr.seqs command reads a fasta file ...\n";
44 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
45 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
49 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
55 //**********************************************************************************************************************
57 PcrSeqsCommand::PcrSeqsCommand(){
59 abort = true; calledHelp = true;
61 vector<string> tempOutNames;
62 outputTypes["fasta"] = tempOutNames;
63 outputTypes["taxonomy"] = tempOutNames;
64 outputTypes["group"] = tempOutNames;
65 outputTypes["name"] = tempOutNames;
66 outputTypes["accnos"] = tempOutNames;
69 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
73 //***************************************************************************************************************
75 PcrSeqsCommand::PcrSeqsCommand(string option) {
78 abort = false; calledHelp = false;
80 //allow user to run help
81 if(option == "help") { help(); abort = true; calledHelp = true; }
82 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
85 vector<string> myArray = setParameters();
87 OptionParser parser(option);
88 map<string,string> parameters = parser.getParameters();
90 ValidParameters validParameter;
91 map<string,string>::iterator it;
93 //check to make sure all parameters are valid for command
94 for (it = parameters.begin(); it != parameters.end(); it++) {
95 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
98 //initialize outputTypes
99 vector<string> tempOutNames;
100 outputTypes["fasta"] = tempOutNames;
101 outputTypes["taxonomy"] = tempOutNames;
102 outputTypes["group"] = tempOutNames;
103 outputTypes["name"] = tempOutNames;
104 outputTypes["accnos"] = tempOutNames;
106 //if the user changes the input directory command factory will send this info to us in the output parameter
107 string inputDir = validParameter.validFile(parameters, "inputdir", false);
108 if (inputDir == "not found"){ inputDir = ""; }
111 it = parameters.find("fasta");
112 //user has given a template file
113 if(it != parameters.end()){
114 path = m->hasPath(it->second);
115 //if the user has not given a path then, add inputdir. else leave path alone.
116 if (path == "") { parameters["fasta"] = inputDir + it->second; }
119 it = parameters.find("oligos");
120 //user has given a template file
121 if(it != parameters.end()){
122 path = m->hasPath(it->second);
123 //if the user has not given a path then, add inputdir. else leave path alone.
124 if (path == "") { parameters["oligos"] = inputDir + it->second; }
127 it = parameters.find("ecoli");
128 //user has given a template file
129 if(it != parameters.end()){
130 path = m->hasPath(it->second);
131 //if the user has not given a path then, add inputdir. else leave path alone.
132 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
135 it = parameters.find("taxonomy");
136 //user has given a template file
137 if(it != parameters.end()){
138 path = m->hasPath(it->second);
139 //if the user has not given a path then, add inputdir. else leave path alone.
140 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
143 it = parameters.find("name");
144 //user has given a template file
145 if(it != parameters.end()){
146 path = m->hasPath(it->second);
147 //if the user has not given a path then, add inputdir. else leave path alone.
148 if (path == "") { parameters["name"] = inputDir + it->second; }
151 it = parameters.find("group");
152 //user has given a template file
153 if(it != parameters.end()){
154 path = m->hasPath(it->second);
155 //if the user has not given a path then, add inputdir. else leave path alone.
156 if (path == "") { parameters["group"] = inputDir + it->second; }
161 //if the user changes the output directory command factory will send this info to us in the output parameter
162 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
164 //check for required parameters
165 fastafile = validParameter.validFile(parameters, "fasta", true);
166 if (fastafile == "not found") {
167 fastafile = m->getFastaFile();
168 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
169 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
170 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
171 else { m->setFastaFile(fastafile); }
174 //check for optional parameter and set defaults
175 // ...at some point should added some additional type checking...
177 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
178 keepprimer = m->isTrue(temp);
180 temp = validParameter.validFile(parameters, "oligos", true);
181 if (temp == "not found"){ oligosfile = ""; }
182 else if(temp == "not open"){ oligosfile = ""; abort = true; }
183 else { oligosfile = temp; m->setOligosFile(oligosfile); }
185 ecolifile = validParameter.validFile(parameters, "ecoli", true);
186 if (ecolifile == "not found"){ ecolifile = ""; }
187 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
189 namefile = validParameter.validFile(parameters, "name", true);
190 if (namefile == "not found"){ namefile = ""; }
191 else if(namefile == "not open"){ namefile = ""; abort = true; }
192 else { m->setNameFile(namefile); }
194 groupfile = validParameter.validFile(parameters, "group", true);
195 if (groupfile == "not found"){ groupfile = ""; }
196 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
197 else { m->setGroupFile(groupfile); }
199 taxfile = validParameter.validFile(parameters, "taxonomy", true);
200 if (taxfile == "not found"){ taxfile = ""; }
201 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
202 else { m->setTaxonomyFile(taxfile); }
204 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
205 m->mothurConvert(temp, pdiffs);
207 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
208 m->mothurConvert(temp, start);
210 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
211 m->mothurConvert(temp, end);
213 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
214 m->setProcessors(temp);
215 m->mothurConvert(temp, processors);
217 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
219 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
222 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
223 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
226 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end != -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
228 if ((ecolifile != "") && (start != -1) && (end != -1)) {
229 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
233 if ((oligosfile != "") && (ecolifile != "")) {
234 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
237 //check to make sure you didn't forget the name file by mistake
238 if (namefile == "") {
239 vector<string> files; files.push_back(fastafile);
240 parser.getNameFile(files);
245 catch(exception& e) {
246 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
250 //***************************************************************************************************************
252 int PcrSeqsCommand::execute(){
255 if (abort == true) { if (calledHelp) { return 0; } return 2; }
257 int start = time(NULL);
259 string thisOutputDir = outputDir;
260 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
261 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
262 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
264 string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
265 outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile);
268 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
269 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
271 vector<unsigned long long> positions;
272 int numFastaSeqs = 0;
273 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
274 positions = m->divideFile(fastafile, processors);
275 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
277 if (processors == 1) {
278 lines.push_back(linePair(0, 1000));
280 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
281 if (positions.size() < processors) { processors = positions.size(); }
283 //figure out how many sequences you have to process
284 int numSeqsPerProcessor = numFastaSeqs / processors;
285 for (int i = 0; i < processors; i++) {
286 int startIndex = i * numSeqsPerProcessor;
287 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
288 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
292 if (m->control_pressed) { return 0; }
294 set<string> badNames;
295 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
296 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
298 if (m->control_pressed) { return 0; }
300 writeAccnos(badNames);
301 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
302 if (namefile != "") { readName(badNames); }
303 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
304 if (groupfile != "") { readGroup(badNames); }
305 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
306 if (taxfile != "") { readTax(badNames); }
307 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
309 m->mothurOutEndLine();
310 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
311 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
312 m->mothurOutEndLine();
313 m->mothurOutEndLine();
315 //set fasta file as new current fastafile
317 itTypes = outputTypes.find("fasta");
318 if (itTypes != outputTypes.end()) {
319 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
322 itTypes = outputTypes.find("name");
323 if (itTypes != outputTypes.end()) {
324 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
327 itTypes = outputTypes.find("group");
328 if (itTypes != outputTypes.end()) {
329 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
332 itTypes = outputTypes.find("accnos");
333 if (itTypes != outputTypes.end()) {
334 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
337 itTypes = outputTypes.find("taxonomy");
338 if (itTypes != outputTypes.end()) {
339 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
342 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
343 m->mothurOutEndLine();
349 catch(exception& e) {
350 m->errorOut(e, "PcrSeqsCommand", "execute");
354 /**************************************************************************************************/
355 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
358 vector<int> processIDS;
362 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
364 //loop through and create all the processes you want
365 while (process != processors) {
369 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
372 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
374 //pass numSeqs to parent
376 string tempFile = filename + toString(getpid()) + ".num.temp";
377 m->openOutputFile(tempFile, out);
378 out << num << '\t' << badSeqNames.size() << endl;
379 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
380 out << (*it) << endl;
386 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
387 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
392 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
394 //force parent to wait until all the processes are done
395 for (int i=0;i<processIDS.size();i++) {
396 int temp = processIDS[i];
400 for (int i = 0; i < processIDS.size(); i++) {
402 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
403 m->openInputFile(tempFile, in);
404 int numBadNames = 0; string name = "";
405 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
406 for (int j = 0; j < numBadNames; j++) {
407 in >> name; m->gobble(in);
408 badSeqNames.insert(name);
410 in.close(); m->mothurRemove(tempFile);
412 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
413 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
415 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
416 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
420 //////////////////////////////////////////////////////////////////////////////////////////////////////
421 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
422 //Above fork() will clone, so memory is separate, but that's not the case with windows,
423 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
424 //////////////////////////////////////////////////////////////////////////////////////////////////////
426 vector<pcrData*> pDataArray;
427 DWORD dwThreadIdArray[processors-1];
428 HANDLE hThreadArray[processors-1];
430 //Create processor worker threads.
431 for( int i=0; i<processors-1; i++ ){
433 string extension = "";
434 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
436 // Allocate memory for thread data.
437 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, start, end, length, lines[i].start, lines[i].end);
438 pDataArray.push_back(tempPcr);
440 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
441 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
445 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
446 processIDS.push_back(processors-1);
448 //Wait until all threads have terminated.
449 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
451 //Close all thread handles and free memory allocations.
452 for(int i=0; i < pDataArray.size(); i++){
453 num += pDataArray[i]->count;
454 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
455 CloseHandle(hThreadArray[i]);
456 delete pDataArray[i];
459 for (int i = 0; i < processIDS.size(); i++) {
460 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
461 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
463 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
464 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
472 catch(exception& e) {
473 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
478 //**********************************************************************************************************************
479 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
482 m->openOutputFile(goodFasta, goodFile);
485 m->openOutputFile(badFasta, badFile);
488 m->openInputFile(filename, inFASTA);
490 inFASTA.seekg(filePos.start);
498 if (m->control_pressed) { break; }
500 Sequence currSeq(inFASTA); m->gobble(inFASTA);
502 string trashCode = "";
503 if (currSeq.getName() != "") {
506 if (oligosfile != "") {
507 map<int, int> mapAligned;
508 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
511 if (primers.size() != 0) {
512 int primerStart = 0; int primerEnd = 0;
513 bool good = findForward(currSeq, primerStart, primerEnd);
515 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
519 if (!keepprimer) { currSeq.padToPos(mapAligned[primerEnd]); }
520 else { currSeq.padToPos(mapAligned[primerStart]); }
522 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
523 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
528 //process reverse primers
529 if (revPrimer.size() != 0) {
530 int primerStart = 0; int primerEnd = 0;
531 bool good = findReverse(currSeq, primerStart, primerEnd);
532 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
536 if (!keepprimer) { currSeq.padFromPos(mapAligned[primerStart]); }
537 else { currSeq.padFromPos(mapAligned[primerEnd]); }
540 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
541 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
545 }else if (ecolifile != "") {
546 //make sure the seqs are aligned
547 lengths.insert(currSeq.getAligned().length());
548 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
549 else if (currSeq.getAligned().length() != length) {
550 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
552 currSeq.padToPos(start);
553 currSeq.padFromPos(end);
555 }else{ //using start and end to trim
556 //make sure the seqs are aligned
557 lengths.insert(currSeq.getAligned().length());
558 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
560 if (start != -1) { currSeq.padToPos(start); }
562 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
564 currSeq.padFromPos(end);
570 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
572 badSeqNames.insert(currSeq.getName());
573 currSeq.setName(currSeq.getName() + '|' + trashCode);
574 currSeq.printSequence(badFile);
579 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
580 unsigned long long pos = inFASTA.tellg();
581 if ((pos == -1) || (pos >= filePos.end)) { break; }
583 if (inFASTA.eof()) { break; }
587 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
590 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
598 catch(exception& e) {
599 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
603 //********************************************************************/
604 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
607 string rawSequence = seq.getUnaligned();
609 for(int j=0;j<primers.size();j++){
610 string oligo = primers[j];
612 if(rawSequence.length() < oligo.length()) { break; }
615 int olength = oligo.length();
616 for (int j = 0; j < rawSequence.length()-olength; j++){
617 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
618 string rawChunk = rawSequence.substr(j, olength);
619 if(compareDNASeq(oligo, rawChunk)) {
621 primerEnd = primerStart + olength;
628 primerStart = 0; primerEnd = 0;
632 catch(exception& e) {
633 m->errorOut(e, "TrimOligos", "stripForward");
637 //******************************************************************/
638 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
641 string rawSequence = seq.getUnaligned();
643 for(int i=0;i<revPrimer.size();i++){
644 string oligo = revPrimer[i];
645 if(rawSequence.length() < oligo.length()) { break; }
648 int olength = oligo.length();
649 for (int j = rawSequence.length()-olength; j >= 0; j--){
650 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
651 string rawChunk = rawSequence.substr(j, olength);
653 if(compareDNASeq(oligo, rawChunk)) {
655 primerEnd = primerStart + olength;
662 primerStart = 0; primerEnd = 0;
665 catch(exception& e) {
666 m->errorOut(e, "PcrSeqsCommand", "findReverse");
670 //********************************************************************/
671 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
673 bool isAligned = false;
676 for (int i = 0; i < seq.length(); i++) {
677 if (!isalpha(seq[i])) { isAligned = true; }
678 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
679 } //ie. the 3rd base may be at spot 10 in the alignment
680 //later when we trim we want to trim from spot 10.
683 catch(exception& e) {
684 m->errorOut(e, "PcrSeqsCommand", "isAligned");
688 //********************************************************************/
689 string PcrSeqsCommand::reverseOligo(string oligo){
693 for(int i=oligo.length()-1;i>=0;i--){
695 if(oligo[i] == 'A') { reverse += 'T'; }
696 else if(oligo[i] == 'T'){ reverse += 'A'; }
697 else if(oligo[i] == 'U'){ reverse += 'A'; }
699 else if(oligo[i] == 'G'){ reverse += 'C'; }
700 else if(oligo[i] == 'C'){ reverse += 'G'; }
702 else if(oligo[i] == 'R'){ reverse += 'Y'; }
703 else if(oligo[i] == 'Y'){ reverse += 'R'; }
705 else if(oligo[i] == 'M'){ reverse += 'K'; }
706 else if(oligo[i] == 'K'){ reverse += 'M'; }
708 else if(oligo[i] == 'W'){ reverse += 'W'; }
709 else if(oligo[i] == 'S'){ reverse += 'S'; }
711 else if(oligo[i] == 'B'){ reverse += 'V'; }
712 else if(oligo[i] == 'V'){ reverse += 'B'; }
714 else if(oligo[i] == 'D'){ reverse += 'H'; }
715 else if(oligo[i] == 'H'){ reverse += 'D'; }
717 else { reverse += 'N'; }
723 catch(exception& e) {
724 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
729 //***************************************************************************************************************
730 bool PcrSeqsCommand::readOligos(){
733 m->openInputFile(oligosfile, inOligos);
735 string type, oligo, group;
737 while(!inOligos.eof()){
741 if(type[0] == '#'){ //ignore
742 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
746 //make type case insensitive
747 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
751 for(int i=0;i<oligo.length();i++){
752 oligo[i] = toupper(oligo[i]);
753 if(oligo[i] == 'U') { oligo[i] = 'T'; }
756 if(type == "FORWARD"){
757 // get rest of line in case there is a primer name
758 while (!inOligos.eof()) {
759 char c = inOligos.get();
760 if (c == 10 || c == 13){ break; }
761 else if (c == 32 || c == 9){;} //space or tab
763 primers.push_back(oligo);
764 }else if(type == "REVERSE"){
765 string oligoRC = reverseOligo(oligo);
766 revPrimer.push_back(oligoRC);
767 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
768 }else if(type == "BARCODE"){
770 }else if((type == "LINKER")||(type == "SPACER")) {;}
771 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
777 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
778 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
779 m->control_pressed = true;
785 }catch(exception& e) {
786 m->errorOut(e, "PcrSeqsCommand", "readOligos");
790 //***************************************************************************************************************
791 bool PcrSeqsCommand::readEcoli(){
794 m->openInputFile(ecolifile, in);
799 length = ecoli.getAligned().length();
800 start = ecoli.getStartPos();
801 end = ecoli.getEndPos();
802 }else { in.close(); m->control_pressed = true; return false; }
807 catch(exception& e) {
808 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
813 //***************************************************************************************************************
814 int PcrSeqsCommand::writeAccnos(set<string> badNames){
816 string thisOutputDir = outputDir;
817 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
818 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
819 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
822 m->openOutputFile(outputFileName, out);
824 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
825 if (m->control_pressed) { break; }
826 out << (*it) << endl;
832 catch(exception& e) {
833 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
838 //******************************************************************/
839 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
842 int length = oligo.length();
844 for(int i=0;i<length;i++){
846 if(oligo[i] != seq[i]){
847 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
848 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
849 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
850 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
851 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
852 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
853 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
854 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
855 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
856 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
857 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
858 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
860 if(success == 0) { break; }
869 catch(exception& e) {
870 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
875 //***************************************************************************************************************
876 int PcrSeqsCommand::readName(set<string>& names){
878 string thisOutputDir = outputDir;
879 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
880 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
883 m->openOutputFile(outputFileName, out);
886 m->openInputFile(namefile, in);
887 string name, firstCol, secondCol;
889 bool wroteSomething = false;
890 int removedCount = 0;
893 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
895 in >> firstCol; m->gobble(in);
898 string savedSecond = secondCol;
899 vector<string> parsedNames;
900 m->splitAtComma(secondCol, parsedNames);
902 vector<string> validSecond; validSecond.clear();
903 for (int i = 0; i < parsedNames.size(); i++) {
904 if (names.count(parsedNames[i]) == 0) {
905 validSecond.push_back(parsedNames[i]);
909 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
910 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
911 removedCount += parsedNames.size();
913 out << firstCol << '\t' << savedSecond << endl;
914 wroteSomething = true;
921 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
922 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
924 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
928 catch(exception& e) {
929 m->errorOut(e, "PcrSeqsCommand", "readName");
933 //**********************************************************************************************************************
934 int PcrSeqsCommand::readGroup(set<string> names){
936 string thisOutputDir = outputDir;
937 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
938 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
941 m->openOutputFile(outputFileName, out);
944 m->openInputFile(groupfile, in);
947 bool wroteSomething = false;
948 int removedCount = 0;
951 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
953 in >> name; //read from first column
954 in >> group; //read from second column
956 //if this name is in the accnos file
957 if (names.count(name) == 0) {
958 wroteSomething = true;
959 out << name << '\t' << group << endl;
960 }else { removedCount++; }
967 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
968 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
970 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
975 catch(exception& e) {
976 m->errorOut(e, "PcrSeqsCommand", "readGroup");
980 //**********************************************************************************************************************
981 int PcrSeqsCommand::readTax(set<string> names){
983 string thisOutputDir = outputDir;
984 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
985 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
987 m->openOutputFile(outputFileName, out);
990 m->openInputFile(taxfile, in);
993 bool wroteSomething = false;
994 int removedCount = 0;
997 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
999 in >> name; //read from first column
1000 in >> tax; //read from second column
1002 //if this name is in the accnos file
1003 if (names.count(name) == 0) {
1004 wroteSomething = true;
1005 out << name << '\t' << tax << endl;
1006 }else { removedCount++; }
1013 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1014 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1016 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1020 catch(exception& e) {
1021 m->errorOut(e, "PcrSeqsCommand", "readTax");
1025 /**************************************************************************************/