5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "PcrSeqsCommand", "setParameters");
39 //**********************************************************************************************************************
40 string PcrSeqsCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The pcr.seqs command reads a fasta file ...\n";
45 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
46 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
50 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
56 //**********************************************************************************************************************
58 PcrSeqsCommand::PcrSeqsCommand(){
60 abort = true; calledHelp = true;
62 vector<string> tempOutNames;
63 outputTypes["fasta"] = tempOutNames;
64 outputTypes["taxonomy"] = tempOutNames;
65 outputTypes["group"] = tempOutNames;
66 outputTypes["name"] = tempOutNames;
67 outputTypes["accnos"] = tempOutNames;
70 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
74 //***************************************************************************************************************
76 PcrSeqsCommand::PcrSeqsCommand(string option) {
79 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
83 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
91 ValidParameters validParameter;
92 map<string,string>::iterator it;
94 //check to make sure all parameters are valid for command
95 for (it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["fasta"] = tempOutNames;
102 outputTypes["taxonomy"] = tempOutNames;
103 outputTypes["group"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
105 outputTypes["accnos"] = tempOutNames;
107 //if the user changes the input directory command factory will send this info to us in the output parameter
108 string inputDir = validParameter.validFile(parameters, "inputdir", false);
109 if (inputDir == "not found"){ inputDir = ""; }
112 it = parameters.find("fasta");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["fasta"] = inputDir + it->second; }
120 it = parameters.find("oligos");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["oligos"] = inputDir + it->second; }
128 it = parameters.find("ecoli");
129 //user has given a template file
130 if(it != parameters.end()){
131 path = m->hasPath(it->second);
132 //if the user has not given a path then, add inputdir. else leave path alone.
133 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
136 it = parameters.find("taxonomy");
137 //user has given a template file
138 if(it != parameters.end()){
139 path = m->hasPath(it->second);
140 //if the user has not given a path then, add inputdir. else leave path alone.
141 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
144 it = parameters.find("name");
145 //user has given a template file
146 if(it != parameters.end()){
147 path = m->hasPath(it->second);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { parameters["name"] = inputDir + it->second; }
152 it = parameters.find("group");
153 //user has given a template file
154 if(it != parameters.end()){
155 path = m->hasPath(it->second);
156 //if the user has not given a path then, add inputdir. else leave path alone.
157 if (path == "") { parameters["group"] = inputDir + it->second; }
162 //if the user changes the output directory command factory will send this info to us in the output parameter
163 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
165 //check for required parameters
166 fastafile = validParameter.validFile(parameters, "fasta", true);
167 if (fastafile == "not found") {
168 fastafile = m->getFastaFile();
169 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
170 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
171 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
172 else { m->setFastaFile(fastafile); }
175 //check for optional parameter and set defaults
176 // ...at some point should added some additional type checking...
178 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
179 keepprimer = m->isTrue(temp);
181 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
182 keepdots = m->isTrue(temp);
184 temp = validParameter.validFile(parameters, "oligos", true);
185 if (temp == "not found"){ oligosfile = ""; }
186 else if(temp == "not open"){ oligosfile = ""; abort = true; }
187 else { oligosfile = temp; m->setOligosFile(oligosfile); }
189 ecolifile = validParameter.validFile(parameters, "ecoli", true);
190 if (ecolifile == "not found"){ ecolifile = ""; }
191 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
193 namefile = validParameter.validFile(parameters, "name", true);
194 if (namefile == "not found"){ namefile = ""; }
195 else if(namefile == "not open"){ namefile = ""; abort = true; }
196 else { m->setNameFile(namefile); }
198 groupfile = validParameter.validFile(parameters, "group", true);
199 if (groupfile == "not found"){ groupfile = ""; }
200 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
201 else { m->setGroupFile(groupfile); }
203 taxfile = validParameter.validFile(parameters, "taxonomy", true);
204 if (taxfile == "not found"){ taxfile = ""; }
205 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
206 else { m->setTaxonomyFile(taxfile); }
208 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
209 m->mothurConvert(temp, pdiffs);
211 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
212 m->mothurConvert(temp, start);
214 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
215 m->mothurConvert(temp, end);
217 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
218 m->setProcessors(temp);
219 m->mothurConvert(temp, processors);
221 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
223 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
226 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
227 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
230 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
232 if ((ecolifile != "") && (start != -1) && (end != -1)) {
233 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
237 if ((oligosfile != "") && (ecolifile != "")) {
238 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
241 //check to make sure you didn't forget the name file by mistake
242 if (namefile == "") {
243 vector<string> files; files.push_back(fastafile);
244 parser.getNameFile(files);
249 catch(exception& e) {
250 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
254 //***************************************************************************************************************
256 int PcrSeqsCommand::execute(){
259 if (abort == true) { if (calledHelp) { return 0; } return 2; }
261 int start = time(NULL);
263 string thisOutputDir = outputDir;
264 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
265 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
266 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
268 string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
269 outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile);
272 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
273 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
275 vector<unsigned long long> positions;
276 int numFastaSeqs = 0;
277 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
278 positions = m->divideFile(fastafile, processors);
279 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
281 if (processors == 1) {
282 lines.push_back(linePair(0, 1000));
284 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
285 if (positions.size() < processors) { processors = positions.size(); }
287 //figure out how many sequences you have to process
288 int numSeqsPerProcessor = numFastaSeqs / processors;
289 for (int i = 0; i < processors; i++) {
290 int startIndex = i * numSeqsPerProcessor;
291 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
292 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
296 if (m->control_pressed) { return 0; }
298 set<string> badNames;
299 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
300 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
302 if (m->control_pressed) { return 0; }
304 //don't write or keep if blank
305 if (badNames.size() != 0) { writeAccnos(badNames); }
306 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
308 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
309 if (namefile != "") { readName(badNames); }
310 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
311 if (groupfile != "") { readGroup(badNames); }
312 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
313 if (taxfile != "") { readTax(badNames); }
314 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
316 m->mothurOutEndLine();
317 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
318 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
319 m->mothurOutEndLine();
320 m->mothurOutEndLine();
322 //set fasta file as new current fastafile
324 itTypes = outputTypes.find("fasta");
325 if (itTypes != outputTypes.end()) {
326 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
329 itTypes = outputTypes.find("name");
330 if (itTypes != outputTypes.end()) {
331 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
334 itTypes = outputTypes.find("group");
335 if (itTypes != outputTypes.end()) {
336 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
339 itTypes = outputTypes.find("accnos");
340 if (itTypes != outputTypes.end()) {
341 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
344 itTypes = outputTypes.find("taxonomy");
345 if (itTypes != outputTypes.end()) {
346 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
349 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
350 m->mothurOutEndLine();
356 catch(exception& e) {
357 m->errorOut(e, "PcrSeqsCommand", "execute");
361 /**************************************************************************************************/
362 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
365 vector<int> processIDS;
369 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
371 //loop through and create all the processes you want
372 while (process != processors) {
376 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
379 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
381 //pass numSeqs to parent
383 string tempFile = filename + toString(getpid()) + ".num.temp";
384 m->openOutputFile(tempFile, out);
385 out << num << '\t' << badSeqNames.size() << endl;
386 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
387 out << (*it) << endl;
393 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
394 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
399 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
401 //force parent to wait until all the processes are done
402 for (int i=0;i<processIDS.size();i++) {
403 int temp = processIDS[i];
407 for (int i = 0; i < processIDS.size(); i++) {
409 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
410 m->openInputFile(tempFile, in);
411 int numBadNames = 0; string name = "";
412 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
413 for (int j = 0; j < numBadNames; j++) {
414 in >> name; m->gobble(in);
415 badSeqNames.insert(name);
417 in.close(); m->mothurRemove(tempFile);
419 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
420 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
422 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
423 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
427 //////////////////////////////////////////////////////////////////////////////////////////////////////
428 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
429 //Above fork() will clone, so memory is separate, but that's not the case with windows,
430 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
431 //////////////////////////////////////////////////////////////////////////////////////////////////////
433 vector<pcrData*> pDataArray;
434 DWORD dwThreadIdArray[processors-1];
435 HANDLE hThreadArray[processors-1];
437 //Create processor worker threads.
438 for( int i=0; i<processors-1; i++ ){
440 string extension = "";
441 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
443 // Allocate memory for thread data.
444 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
445 pDataArray.push_back(tempPcr);
447 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
448 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
452 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
453 processIDS.push_back(processors-1);
455 //Wait until all threads have terminated.
456 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
458 //Close all thread handles and free memory allocations.
459 for(int i=0; i < pDataArray.size(); i++){
460 num += pDataArray[i]->count;
461 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
462 CloseHandle(hThreadArray[i]);
463 delete pDataArray[i];
466 for (int i = 0; i < processIDS.size(); i++) {
467 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
468 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
470 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
471 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
479 catch(exception& e) {
480 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
485 //**********************************************************************************************************************
486 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
489 m->openOutputFile(goodFasta, goodFile);
492 m->openOutputFile(badFasta, badFile);
495 m->openInputFile(filename, inFASTA);
497 inFASTA.seekg(filePos.start);
505 if (m->control_pressed) { break; }
507 Sequence currSeq(inFASTA); m->gobble(inFASTA);
509 string trashCode = "";
510 if (currSeq.getName() != "") {
513 if (oligosfile != "") {
514 map<int, int> mapAligned;
515 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
518 if (primers.size() != 0) {
519 int primerStart = 0; int primerEnd = 0;
520 bool good = findForward(currSeq, primerStart, primerEnd);
522 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
527 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
528 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
531 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
532 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
535 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
536 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
541 //process reverse primers
542 if (revPrimer.size() != 0) {
543 int primerStart = 0; int primerEnd = 0;
544 bool good = findReverse(currSeq, primerStart, primerEnd);
545 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
550 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
551 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
554 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
555 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
559 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
560 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
564 }else if (ecolifile != "") {
565 //make sure the seqs are aligned
566 lengths.insert(currSeq.getAligned().length());
567 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
568 else if (currSeq.getAligned().length() != length) {
569 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
572 currSeq.filterToPos(start);
573 currSeq.filterFromPos(end);
575 string seqString = currSeq.getAligned().substr(0, end);
576 seqString = seqString.substr(start);
577 currSeq.setAligned(seqString);
580 }else{ //using start and end to trim
581 //make sure the seqs are aligned
582 lengths.insert(currSeq.getAligned().length());
583 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
586 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
588 if (keepdots) { currSeq.filterFromPos(end); }
590 string seqString = currSeq.getAligned().substr(0, end);
591 currSeq.setAligned(seqString);
596 if (keepdots) { currSeq.filterToPos(start); }
598 string seqString = currSeq.getAligned().substr(start);
599 currSeq.setAligned(seqString);
605 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
607 badSeqNames.insert(currSeq.getName());
608 currSeq.setName(currSeq.getName() + '|' + trashCode);
609 currSeq.printSequence(badFile);
614 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
615 unsigned long long pos = inFASTA.tellg();
616 if ((pos == -1) || (pos >= filePos.end)) { break; }
618 if (inFASTA.eof()) { break; }
622 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
625 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
633 catch(exception& e) {
634 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
638 //********************************************************************/
639 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
642 string rawSequence = seq.getUnaligned();
644 for(int j=0;j<primers.size();j++){
645 string oligo = primers[j];
647 if(rawSequence.length() < oligo.length()) { break; }
650 int olength = oligo.length();
651 for (int j = 0; j < rawSequence.length()-olength; j++){
652 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
653 string rawChunk = rawSequence.substr(j, olength);
654 if(compareDNASeq(oligo, rawChunk)) {
656 primerEnd = primerStart + olength;
663 primerStart = 0; primerEnd = 0;
667 catch(exception& e) {
668 m->errorOut(e, "TrimOligos", "stripForward");
672 //******************************************************************/
673 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
676 string rawSequence = seq.getUnaligned();
678 for(int i=0;i<revPrimer.size();i++){
679 string oligo = revPrimer[i];
680 if(rawSequence.length() < oligo.length()) { break; }
683 int olength = oligo.length();
684 for (int j = rawSequence.length()-olength; j >= 0; j--){
685 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
686 string rawChunk = rawSequence.substr(j, olength);
688 if(compareDNASeq(oligo, rawChunk)) {
690 primerEnd = primerStart + olength;
697 primerStart = 0; primerEnd = 0;
700 catch(exception& e) {
701 m->errorOut(e, "PcrSeqsCommand", "findReverse");
705 //********************************************************************/
706 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
708 bool isAligned = false;
711 for (int i = 0; i < seq.length(); i++) {
712 if (!isalpha(seq[i])) { isAligned = true; }
713 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
714 } //ie. the 3rd base may be at spot 10 in the alignment
715 //later when we trim we want to trim from spot 10.
718 catch(exception& e) {
719 m->errorOut(e, "PcrSeqsCommand", "isAligned");
723 //********************************************************************/
724 string PcrSeqsCommand::reverseOligo(string oligo){
728 for(int i=oligo.length()-1;i>=0;i--){
730 if(oligo[i] == 'A') { reverse += 'T'; }
731 else if(oligo[i] == 'T'){ reverse += 'A'; }
732 else if(oligo[i] == 'U'){ reverse += 'A'; }
734 else if(oligo[i] == 'G'){ reverse += 'C'; }
735 else if(oligo[i] == 'C'){ reverse += 'G'; }
737 else if(oligo[i] == 'R'){ reverse += 'Y'; }
738 else if(oligo[i] == 'Y'){ reverse += 'R'; }
740 else if(oligo[i] == 'M'){ reverse += 'K'; }
741 else if(oligo[i] == 'K'){ reverse += 'M'; }
743 else if(oligo[i] == 'W'){ reverse += 'W'; }
744 else if(oligo[i] == 'S'){ reverse += 'S'; }
746 else if(oligo[i] == 'B'){ reverse += 'V'; }
747 else if(oligo[i] == 'V'){ reverse += 'B'; }
749 else if(oligo[i] == 'D'){ reverse += 'H'; }
750 else if(oligo[i] == 'H'){ reverse += 'D'; }
752 else { reverse += 'N'; }
758 catch(exception& e) {
759 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
764 //***************************************************************************************************************
765 bool PcrSeqsCommand::readOligos(){
768 m->openInputFile(oligosfile, inOligos);
770 string type, oligo, group;
772 while(!inOligos.eof()){
776 if(type[0] == '#'){ //ignore
777 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
781 //make type case insensitive
782 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
786 for(int i=0;i<oligo.length();i++){
787 oligo[i] = toupper(oligo[i]);
788 if(oligo[i] == 'U') { oligo[i] = 'T'; }
791 if(type == "FORWARD"){
792 // get rest of line in case there is a primer name
793 while (!inOligos.eof()) {
794 char c = inOligos.get();
795 if (c == 10 || c == 13){ break; }
796 else if (c == 32 || c == 9){;} //space or tab
798 primers.push_back(oligo);
799 }else if(type == "REVERSE"){
800 string oligoRC = reverseOligo(oligo);
801 revPrimer.push_back(oligoRC);
802 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
803 }else if(type == "BARCODE"){
805 }else if((type == "LINKER")||(type == "SPACER")) {;}
806 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
812 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
813 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
814 m->control_pressed = true;
820 }catch(exception& e) {
821 m->errorOut(e, "PcrSeqsCommand", "readOligos");
825 //***************************************************************************************************************
826 bool PcrSeqsCommand::readEcoli(){
829 m->openInputFile(ecolifile, in);
834 length = ecoli.getAligned().length();
835 start = ecoli.getStartPos();
836 end = ecoli.getEndPos();
837 }else { in.close(); m->control_pressed = true; return false; }
842 catch(exception& e) {
843 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
848 //***************************************************************************************************************
849 int PcrSeqsCommand::writeAccnos(set<string> badNames){
851 string thisOutputDir = outputDir;
852 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
853 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
854 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
857 m->openOutputFile(outputFileName, out);
859 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
860 if (m->control_pressed) { break; }
861 out << (*it) << endl;
867 catch(exception& e) {
868 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
873 //******************************************************************/
874 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
877 int length = oligo.length();
879 for(int i=0;i<length;i++){
881 if(oligo[i] != seq[i]){
882 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
883 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
884 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
885 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
886 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
887 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
888 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
889 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
890 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
891 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
892 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
893 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
895 if(success == 0) { break; }
904 catch(exception& e) {
905 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
910 //***************************************************************************************************************
911 int PcrSeqsCommand::readName(set<string>& names){
913 string thisOutputDir = outputDir;
914 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
915 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
918 m->openOutputFile(outputFileName, out);
921 m->openInputFile(namefile, in);
922 string name, firstCol, secondCol;
924 bool wroteSomething = false;
925 int removedCount = 0;
928 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
930 in >> firstCol; m->gobble(in);
933 string savedSecond = secondCol;
934 vector<string> parsedNames;
935 m->splitAtComma(secondCol, parsedNames);
937 vector<string> validSecond; validSecond.clear();
938 for (int i = 0; i < parsedNames.size(); i++) {
939 if (names.count(parsedNames[i]) == 0) {
940 validSecond.push_back(parsedNames[i]);
944 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
945 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
946 removedCount += parsedNames.size();
948 out << firstCol << '\t' << savedSecond << endl;
949 wroteSomething = true;
956 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
957 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
959 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
963 catch(exception& e) {
964 m->errorOut(e, "PcrSeqsCommand", "readName");
968 //**********************************************************************************************************************
969 int PcrSeqsCommand::readGroup(set<string> names){
971 string thisOutputDir = outputDir;
972 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
973 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
976 m->openOutputFile(outputFileName, out);
979 m->openInputFile(groupfile, in);
982 bool wroteSomething = false;
983 int removedCount = 0;
986 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
988 in >> name; //read from first column
989 in >> group; //read from second column
991 //if this name is in the accnos file
992 if (names.count(name) == 0) {
993 wroteSomething = true;
994 out << name << '\t' << group << endl;
995 }else { removedCount++; }
1002 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1003 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1005 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1010 catch(exception& e) {
1011 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1015 //**********************************************************************************************************************
1016 int PcrSeqsCommand::readTax(set<string> names){
1018 string thisOutputDir = outputDir;
1019 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1020 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
1022 m->openOutputFile(outputFileName, out);
1025 m->openInputFile(taxfile, in);
1028 bool wroteSomething = false;
1029 int removedCount = 0;
1032 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1034 in >> name; //read from first column
1035 in >> tax; //read from second column
1037 //if this name is in the accnos file
1038 if (names.count(name) == 0) {
1039 wroteSomething = true;
1040 out << name << '\t' << tax << endl;
1041 }else { removedCount++; }
1048 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1049 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1051 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1055 catch(exception& e) {
1056 m->errorOut(e, "PcrSeqsCommand", "readTax");
1060 /**************************************************************************************/