5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "",false,false); parameters.push_back(ppdiffs);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "PcrSeqsCommand", "setParameters");
39 //**********************************************************************************************************************
40 string PcrSeqsCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The pcr.seqs command reads a fasta file ...\n";
45 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
46 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
50 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
56 //**********************************************************************************************************************
58 PcrSeqsCommand::PcrSeqsCommand(){
60 abort = true; calledHelp = true;
62 vector<string> tempOutNames;
63 outputTypes["fasta"] = tempOutNames;
64 outputTypes["taxonomy"] = tempOutNames;
65 outputTypes["group"] = tempOutNames;
66 outputTypes["name"] = tempOutNames;
67 outputTypes["accnos"] = tempOutNames;
70 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
74 //***************************************************************************************************************
76 PcrSeqsCommand::PcrSeqsCommand(string option) {
79 abort = false; calledHelp = false;
81 //allow user to run help
82 if(option == "help") { help(); abort = true; calledHelp = true; }
83 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
86 vector<string> myArray = setParameters();
88 OptionParser parser(option);
89 map<string,string> parameters = parser.getParameters();
91 ValidParameters validParameter;
92 map<string,string>::iterator it;
94 //check to make sure all parameters are valid for command
95 for (it = parameters.begin(); it != parameters.end(); it++) {
96 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
99 //initialize outputTypes
100 vector<string> tempOutNames;
101 outputTypes["fasta"] = tempOutNames;
102 outputTypes["taxonomy"] = tempOutNames;
103 outputTypes["group"] = tempOutNames;
104 outputTypes["name"] = tempOutNames;
105 outputTypes["accnos"] = tempOutNames;
107 //if the user changes the input directory command factory will send this info to us in the output parameter
108 string inputDir = validParameter.validFile(parameters, "inputdir", false);
109 if (inputDir == "not found"){ inputDir = ""; }
112 it = parameters.find("fasta");
113 //user has given a template file
114 if(it != parameters.end()){
115 path = m->hasPath(it->second);
116 //if the user has not given a path then, add inputdir. else leave path alone.
117 if (path == "") { parameters["fasta"] = inputDir + it->second; }
120 it = parameters.find("oligos");
121 //user has given a template file
122 if(it != parameters.end()){
123 path = m->hasPath(it->second);
124 //if the user has not given a path then, add inputdir. else leave path alone.
125 if (path == "") { parameters["oligos"] = inputDir + it->second; }
128 it = parameters.find("ecoli");
129 //user has given a template file
130 if(it != parameters.end()){
131 path = m->hasPath(it->second);
132 //if the user has not given a path then, add inputdir. else leave path alone.
133 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
136 it = parameters.find("taxonomy");
137 //user has given a template file
138 if(it != parameters.end()){
139 path = m->hasPath(it->second);
140 //if the user has not given a path then, add inputdir. else leave path alone.
141 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
144 it = parameters.find("name");
145 //user has given a template file
146 if(it != parameters.end()){
147 path = m->hasPath(it->second);
148 //if the user has not given a path then, add inputdir. else leave path alone.
149 if (path == "") { parameters["name"] = inputDir + it->second; }
152 it = parameters.find("group");
153 //user has given a template file
154 if(it != parameters.end()){
155 path = m->hasPath(it->second);
156 //if the user has not given a path then, add inputdir. else leave path alone.
157 if (path == "") { parameters["group"] = inputDir + it->second; }
162 //if the user changes the output directory command factory will send this info to us in the output parameter
163 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = ""; }
165 //check for required parameters
166 fastafile = validParameter.validFile(parameters, "fasta", true);
167 if (fastafile == "not found") {
168 fastafile = m->getFastaFile();
169 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
170 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
171 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
172 else { m->setFastaFile(fastafile); }
175 //check for optional parameter and set defaults
176 // ...at some point should added some additional type checking...
178 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
179 keepprimer = m->isTrue(temp);
181 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
182 keepdots = m->isTrue(temp);
184 temp = validParameter.validFile(parameters, "oligos", true);
185 if (temp == "not found"){ oligosfile = ""; }
186 else if(temp == "not open"){ oligosfile = ""; abort = true; }
187 else { oligosfile = temp; m->setOligosFile(oligosfile); }
189 ecolifile = validParameter.validFile(parameters, "ecoli", true);
190 if (ecolifile == "not found"){ ecolifile = ""; }
191 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
193 namefile = validParameter.validFile(parameters, "name", true);
194 if (namefile == "not found"){ namefile = ""; }
195 else if(namefile == "not open"){ namefile = ""; abort = true; }
196 else { m->setNameFile(namefile); }
198 groupfile = validParameter.validFile(parameters, "group", true);
199 if (groupfile == "not found"){ groupfile = ""; }
200 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
201 else { m->setGroupFile(groupfile); }
203 taxfile = validParameter.validFile(parameters, "taxonomy", true);
204 if (taxfile == "not found"){ taxfile = ""; }
205 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
206 else { m->setTaxonomyFile(taxfile); }
208 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
209 m->mothurConvert(temp, pdiffs);
211 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
212 m->mothurConvert(temp, start);
214 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
215 m->mothurConvert(temp, end);
217 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
218 m->setProcessors(temp);
219 m->mothurConvert(temp, processors);
221 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
223 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
226 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
227 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
230 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
232 if ((ecolifile != "") && (start != -1) && (end != -1)) {
233 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
237 if ((oligosfile != "") && (ecolifile != "")) {
238 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
241 //check to make sure you didn't forget the name file by mistake
242 if (namefile == "") {
243 vector<string> files; files.push_back(fastafile);
244 parser.getNameFile(files);
249 catch(exception& e) {
250 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
254 //***************************************************************************************************************
256 int PcrSeqsCommand::execute(){
259 if (abort == true) { if (calledHelp) { return 0; } return 2; }
261 int start = time(NULL);
263 string thisOutputDir = outputDir;
264 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
265 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.fasta";
266 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
268 string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "pcr.scrap.fasta";
272 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
273 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
275 vector<unsigned long long> positions;
276 int numFastaSeqs = 0;
277 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
278 positions = m->divideFile(fastafile, processors);
279 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
281 if (processors == 1) {
282 lines.push_back(linePair(0, 1000));
284 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
285 if (positions.size() < processors) { processors = positions.size(); }
287 //figure out how many sequences you have to process
288 int numSeqsPerProcessor = numFastaSeqs / processors;
289 for (int i = 0; i < processors; i++) {
290 int startIndex = i * numSeqsPerProcessor;
291 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
292 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
296 if (m->control_pressed) { return 0; }
298 set<string> badNames;
299 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
300 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
302 if (m->control_pressed) { return 0; }
304 //don't write or keep if blank
305 if (badNames.size() != 0) { writeAccnos(badNames); }
306 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
307 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
309 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
310 if (namefile != "") { readName(badNames); }
311 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
312 if (groupfile != "") { readGroup(badNames); }
313 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
314 if (taxfile != "") { readTax(badNames); }
315 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
317 m->mothurOutEndLine();
318 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
319 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
320 m->mothurOutEndLine();
321 m->mothurOutEndLine();
323 //set fasta file as new current fastafile
325 itTypes = outputTypes.find("fasta");
326 if (itTypes != outputTypes.end()) {
327 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
330 itTypes = outputTypes.find("name");
331 if (itTypes != outputTypes.end()) {
332 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
335 itTypes = outputTypes.find("group");
336 if (itTypes != outputTypes.end()) {
337 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
340 itTypes = outputTypes.find("accnos");
341 if (itTypes != outputTypes.end()) {
342 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
345 itTypes = outputTypes.find("taxonomy");
346 if (itTypes != outputTypes.end()) {
347 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
350 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
351 m->mothurOutEndLine();
357 catch(exception& e) {
358 m->errorOut(e, "PcrSeqsCommand", "execute");
362 /**************************************************************************************************/
363 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
366 vector<int> processIDS;
370 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
372 //loop through and create all the processes you want
373 while (process != processors) {
377 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
380 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
382 //pass numSeqs to parent
384 string tempFile = filename + toString(getpid()) + ".num.temp";
385 m->openOutputFile(tempFile, out);
386 out << num << '\t' << badSeqNames.size() << endl;
387 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
388 out << (*it) << endl;
394 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
395 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
400 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
402 //force parent to wait until all the processes are done
403 for (int i=0;i<processIDS.size();i++) {
404 int temp = processIDS[i];
408 for (int i = 0; i < processIDS.size(); i++) {
410 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
411 m->openInputFile(tempFile, in);
412 int numBadNames = 0; string name = "";
413 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
414 for (int j = 0; j < numBadNames; j++) {
415 in >> name; m->gobble(in);
416 badSeqNames.insert(name);
418 in.close(); m->mothurRemove(tempFile);
420 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
421 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
423 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
424 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
428 //////////////////////////////////////////////////////////////////////////////////////////////////////
429 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
430 //Above fork() will clone, so memory is separate, but that's not the case with windows,
431 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
432 //////////////////////////////////////////////////////////////////////////////////////////////////////
434 vector<pcrData*> pDataArray;
435 DWORD dwThreadIdArray[processors-1];
436 HANDLE hThreadArray[processors-1];
438 //Create processor worker threads.
439 for( int i=0; i<processors-1; i++ ){
441 string extension = "";
442 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
444 // Allocate memory for thread data.
445 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
446 pDataArray.push_back(tempPcr);
448 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
449 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
453 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
454 processIDS.push_back(processors-1);
456 //Wait until all threads have terminated.
457 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
459 //Close all thread handles and free memory allocations.
460 for(int i=0; i < pDataArray.size(); i++){
461 num += pDataArray[i]->count;
462 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
463 CloseHandle(hThreadArray[i]);
464 delete pDataArray[i];
467 for (int i = 0; i < processIDS.size(); i++) {
468 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
469 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
471 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
472 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
480 catch(exception& e) {
481 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
486 //**********************************************************************************************************************
487 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
490 m->openOutputFile(goodFasta, goodFile);
493 m->openOutputFile(badFasta, badFile);
496 m->openInputFile(filename, inFASTA);
498 inFASTA.seekg(filePos.start);
506 if (m->control_pressed) { break; }
508 Sequence currSeq(inFASTA); m->gobble(inFASTA);
510 string trashCode = "";
511 if (currSeq.getName() != "") {
514 if (oligosfile != "") {
515 map<int, int> mapAligned;
516 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
519 if (primers.size() != 0) {
520 int primerStart = 0; int primerEnd = 0;
521 bool good = findForward(currSeq, primerStart, primerEnd);
523 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
528 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
529 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
532 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
533 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
536 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
537 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
542 //process reverse primers
543 if (revPrimer.size() != 0) {
544 int primerStart = 0; int primerEnd = 0;
545 bool good = findReverse(currSeq, primerStart, primerEnd);
546 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
551 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
552 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
555 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
556 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
560 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
561 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
565 }else if (ecolifile != "") {
566 //make sure the seqs are aligned
567 lengths.insert(currSeq.getAligned().length());
568 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
569 else if (currSeq.getAligned().length() != length) {
570 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
573 currSeq.filterToPos(start);
574 currSeq.filterFromPos(end);
576 string seqString = currSeq.getAligned().substr(0, end);
577 seqString = seqString.substr(start);
578 currSeq.setAligned(seqString);
581 }else{ //using start and end to trim
582 //make sure the seqs are aligned
583 lengths.insert(currSeq.getAligned().length());
584 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
587 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
589 if (keepdots) { currSeq.filterFromPos(end); }
591 string seqString = currSeq.getAligned().substr(0, end);
592 currSeq.setAligned(seqString);
597 if (keepdots) { currSeq.filterToPos(start); }
599 string seqString = currSeq.getAligned().substr(start);
600 currSeq.setAligned(seqString);
606 //trimming removed all bases
607 if (currSeq.getUnaligned() == "") { goodSeq = false; }
609 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
611 badSeqNames.insert(currSeq.getName());
612 currSeq.setName(currSeq.getName() + '|' + trashCode);
613 currSeq.printSequence(badFile);
618 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
619 unsigned long long pos = inFASTA.tellg();
620 if ((pos == -1) || (pos >= filePos.end)) { break; }
622 if (inFASTA.eof()) { break; }
626 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
629 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
637 catch(exception& e) {
638 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
642 //********************************************************************/
643 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
646 string rawSequence = seq.getUnaligned();
648 for(int j=0;j<primers.size();j++){
649 string oligo = primers[j];
651 if(rawSequence.length() < oligo.length()) { break; }
654 int olength = oligo.length();
655 for (int j = 0; j < rawSequence.length()-olength; j++){
656 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
657 string rawChunk = rawSequence.substr(j, olength);
658 if(compareDNASeq(oligo, rawChunk)) {
660 primerEnd = primerStart + olength;
667 primerStart = 0; primerEnd = 0;
671 catch(exception& e) {
672 m->errorOut(e, "TrimOligos", "stripForward");
676 //******************************************************************/
677 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
680 string rawSequence = seq.getUnaligned();
682 for(int i=0;i<revPrimer.size();i++){
683 string oligo = revPrimer[i];
684 if(rawSequence.length() < oligo.length()) { break; }
687 int olength = oligo.length();
688 for (int j = rawSequence.length()-olength; j >= 0; j--){
689 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
690 string rawChunk = rawSequence.substr(j, olength);
692 if(compareDNASeq(oligo, rawChunk)) {
694 primerEnd = primerStart + olength;
701 primerStart = 0; primerEnd = 0;
704 catch(exception& e) {
705 m->errorOut(e, "PcrSeqsCommand", "findReverse");
709 //********************************************************************/
710 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
712 bool isAligned = false;
715 for (int i = 0; i < seq.length(); i++) {
716 if (!isalpha(seq[i])) { isAligned = true; }
717 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
718 } //ie. the 3rd base may be at spot 10 in the alignment
719 //later when we trim we want to trim from spot 10.
722 catch(exception& e) {
723 m->errorOut(e, "PcrSeqsCommand", "isAligned");
727 //********************************************************************/
728 string PcrSeqsCommand::reverseOligo(string oligo){
732 for(int i=oligo.length()-1;i>=0;i--){
734 if(oligo[i] == 'A') { reverse += 'T'; }
735 else if(oligo[i] == 'T'){ reverse += 'A'; }
736 else if(oligo[i] == 'U'){ reverse += 'A'; }
738 else if(oligo[i] == 'G'){ reverse += 'C'; }
739 else if(oligo[i] == 'C'){ reverse += 'G'; }
741 else if(oligo[i] == 'R'){ reverse += 'Y'; }
742 else if(oligo[i] == 'Y'){ reverse += 'R'; }
744 else if(oligo[i] == 'M'){ reverse += 'K'; }
745 else if(oligo[i] == 'K'){ reverse += 'M'; }
747 else if(oligo[i] == 'W'){ reverse += 'W'; }
748 else if(oligo[i] == 'S'){ reverse += 'S'; }
750 else if(oligo[i] == 'B'){ reverse += 'V'; }
751 else if(oligo[i] == 'V'){ reverse += 'B'; }
753 else if(oligo[i] == 'D'){ reverse += 'H'; }
754 else if(oligo[i] == 'H'){ reverse += 'D'; }
756 else { reverse += 'N'; }
762 catch(exception& e) {
763 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
768 //***************************************************************************************************************
769 bool PcrSeqsCommand::readOligos(){
772 m->openInputFile(oligosfile, inOligos);
774 string type, oligo, group;
776 while(!inOligos.eof()){
780 if(type[0] == '#'){ //ignore
781 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
785 //make type case insensitive
786 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
790 for(int i=0;i<oligo.length();i++){
791 oligo[i] = toupper(oligo[i]);
792 if(oligo[i] == 'U') { oligo[i] = 'T'; }
795 if(type == "FORWARD"){
796 // get rest of line in case there is a primer name
797 while (!inOligos.eof()) {
798 char c = inOligos.get();
799 if (c == 10 || c == 13){ break; }
800 else if (c == 32 || c == 9){;} //space or tab
802 primers.push_back(oligo);
803 }else if(type == "REVERSE"){
804 string oligoRC = reverseOligo(oligo);
805 revPrimer.push_back(oligoRC);
806 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
807 }else if(type == "BARCODE"){
809 }else if((type == "LINKER")||(type == "SPACER")) {;}
810 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
816 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
817 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
818 m->control_pressed = true;
824 }catch(exception& e) {
825 m->errorOut(e, "PcrSeqsCommand", "readOligos");
829 //***************************************************************************************************************
830 bool PcrSeqsCommand::readEcoli(){
833 m->openInputFile(ecolifile, in);
838 length = ecoli.getAligned().length();
839 start = ecoli.getStartPos();
840 end = ecoli.getEndPos();
841 }else { in.close(); m->control_pressed = true; return false; }
846 catch(exception& e) {
847 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
852 //***************************************************************************************************************
853 int PcrSeqsCommand::writeAccnos(set<string> badNames){
855 string thisOutputDir = outputDir;
856 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
857 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "bad.accnos";
858 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
861 m->openOutputFile(outputFileName, out);
863 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
864 if (m->control_pressed) { break; }
865 out << (*it) << endl;
871 catch(exception& e) {
872 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
877 //******************************************************************/
878 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
881 int length = oligo.length();
883 for(int i=0;i<length;i++){
885 if(oligo[i] != seq[i]){
886 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
887 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
888 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
889 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
890 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
891 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
892 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
893 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
894 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
895 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
896 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
897 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
899 if(success == 0) { break; }
908 catch(exception& e) {
909 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
914 //***************************************************************************************************************
915 int PcrSeqsCommand::readName(set<string>& names){
917 string thisOutputDir = outputDir;
918 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
919 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + "pcr" + m->getExtension(namefile);
922 m->openOutputFile(outputFileName, out);
925 m->openInputFile(namefile, in);
926 string name, firstCol, secondCol;
928 bool wroteSomething = false;
929 int removedCount = 0;
932 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
934 in >> firstCol; m->gobble(in);
937 string savedSecond = secondCol;
938 vector<string> parsedNames;
939 m->splitAtComma(secondCol, parsedNames);
941 vector<string> validSecond; validSecond.clear();
942 for (int i = 0; i < parsedNames.size(); i++) {
943 if (names.count(parsedNames[i]) == 0) {
944 validSecond.push_back(parsedNames[i]);
948 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
949 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
950 removedCount += parsedNames.size();
952 out << firstCol << '\t' << savedSecond << endl;
953 wroteSomething = true;
960 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
961 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
963 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
967 catch(exception& e) {
968 m->errorOut(e, "PcrSeqsCommand", "readName");
972 //**********************************************************************************************************************
973 int PcrSeqsCommand::readGroup(set<string> names){
975 string thisOutputDir = outputDir;
976 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
977 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + "pcr" + m->getExtension(groupfile);
980 m->openOutputFile(outputFileName, out);
983 m->openInputFile(groupfile, in);
986 bool wroteSomething = false;
987 int removedCount = 0;
990 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
992 in >> name; //read from first column
993 in >> group; //read from second column
995 //if this name is in the accnos file
996 if (names.count(name) == 0) {
997 wroteSomething = true;
998 out << name << '\t' << group << endl;
999 }else { removedCount++; }
1006 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1007 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1009 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1014 catch(exception& e) {
1015 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1019 //**********************************************************************************************************************
1020 int PcrSeqsCommand::readTax(set<string> names){
1022 string thisOutputDir = outputDir;
1023 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1024 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + "pcr" + m->getExtension(taxfile);
1026 m->openOutputFile(outputFileName, out);
1029 m->openInputFile(taxfile, in);
1032 bool wroteSomething = false;
1033 int removedCount = 0;
1036 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1038 in >> name; //read from first column
1039 in >> tax; //read from second column
1041 //if this name is in the accnos file
1042 if (names.count(name) == 0) {
1043 wroteSomething = true;
1044 out << name << '\t' << tax << endl;
1045 }else { removedCount++; }
1052 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1053 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1055 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1059 catch(exception& e) {
1060 m->errorOut(e, "PcrSeqsCommand", "readTax");
1064 /**************************************************************************************/