5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
25 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
26 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
27 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
28 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
30 vector<string> myArray;
31 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
35 m->errorOut(e, "PcrSeqsCommand", "setParameters");
39 //**********************************************************************************************************************
40 string PcrSeqsCommand::getHelpString(){
42 string helpString = "";
43 helpString += "The pcr.seqs command reads a fasta file.\n";
44 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
45 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
46 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
47 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
48 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
49 helpString += "The processors parameter allows you to use multiple processors.\n";
50 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
51 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
52 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
53 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
57 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
61 //**********************************************************************************************************************
62 string PcrSeqsCommand::getOutputPattern(string type) {
66 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
67 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
68 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
69 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
70 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
72 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
77 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
81 //**********************************************************************************************************************
83 PcrSeqsCommand::PcrSeqsCommand(){
85 abort = true; calledHelp = true;
87 vector<string> tempOutNames;
88 outputTypes["fasta"] = tempOutNames;
89 outputTypes["taxonomy"] = tempOutNames;
90 outputTypes["group"] = tempOutNames;
91 outputTypes["name"] = tempOutNames;
92 outputTypes["count"] = tempOutNames;
93 outputTypes["accnos"] = tempOutNames;
96 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
100 //***************************************************************************************************************
102 PcrSeqsCommand::PcrSeqsCommand(string option) {
105 abort = false; calledHelp = false;
107 //allow user to run help
108 if(option == "help") { help(); abort = true; calledHelp = true; }
109 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
112 vector<string> myArray = setParameters();
114 OptionParser parser(option);
115 map<string,string> parameters = parser.getParameters();
117 ValidParameters validParameter;
118 map<string,string>::iterator it;
120 //check to make sure all parameters are valid for command
121 for (it = parameters.begin(); it != parameters.end(); it++) {
122 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
125 //initialize outputTypes
126 vector<string> tempOutNames;
127 outputTypes["fasta"] = tempOutNames;
128 outputTypes["taxonomy"] = tempOutNames;
129 outputTypes["group"] = tempOutNames;
130 outputTypes["name"] = tempOutNames;
131 outputTypes["accnos"] = tempOutNames;
132 outputTypes["count"] = tempOutNames;
134 //if the user changes the input directory command factory will send this info to us in the output parameter
135 string inputDir = validParameter.validFile(parameters, "inputdir", false);
136 if (inputDir == "not found"){ inputDir = ""; }
139 it = parameters.find("fasta");
140 //user has given a template file
141 if(it != parameters.end()){
142 path = m->hasPath(it->second);
143 //if the user has not given a path then, add inputdir. else leave path alone.
144 if (path == "") { parameters["fasta"] = inputDir + it->second; }
147 it = parameters.find("oligos");
148 //user has given a template file
149 if(it != parameters.end()){
150 path = m->hasPath(it->second);
151 //if the user has not given a path then, add inputdir. else leave path alone.
152 if (path == "") { parameters["oligos"] = inputDir + it->second; }
155 it = parameters.find("ecoli");
156 //user has given a template file
157 if(it != parameters.end()){
158 path = m->hasPath(it->second);
159 //if the user has not given a path then, add inputdir. else leave path alone.
160 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
163 it = parameters.find("taxonomy");
164 //user has given a template file
165 if(it != parameters.end()){
166 path = m->hasPath(it->second);
167 //if the user has not given a path then, add inputdir. else leave path alone.
168 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
171 it = parameters.find("name");
172 //user has given a template file
173 if(it != parameters.end()){
174 path = m->hasPath(it->second);
175 //if the user has not given a path then, add inputdir. else leave path alone.
176 if (path == "") { parameters["name"] = inputDir + it->second; }
179 it = parameters.find("group");
180 //user has given a template file
181 if(it != parameters.end()){
182 path = m->hasPath(it->second);
183 //if the user has not given a path then, add inputdir. else leave path alone.
184 if (path == "") { parameters["group"] = inputDir + it->second; }
187 it = parameters.find("count");
188 //user has given a template file
189 if(it != parameters.end()){
190 path = m->hasPath(it->second);
191 //if the user has not given a path then, add inputdir. else leave path alone.
192 if (path == "") { parameters["count"] = inputDir + it->second; }
198 //check for required parameters
199 fastafile = validParameter.validFile(parameters, "fasta", true);
200 if (fastafile == "not found") {
201 fastafile = m->getFastaFile();
202 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
203 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
204 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
205 else { m->setFastaFile(fastafile); }
207 //if the user changes the output directory command factory will send this info to us in the output parameter
208 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
210 //check for optional parameter and set defaults
211 // ...at some point should added some additional type checking...
213 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
214 keepprimer = m->isTrue(temp);
216 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
217 keepdots = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "oligos", true);
220 if (temp == "not found"){ oligosfile = ""; }
221 else if(temp == "not open"){ oligosfile = ""; abort = true; }
222 else { oligosfile = temp; m->setOligosFile(oligosfile); }
224 ecolifile = validParameter.validFile(parameters, "ecoli", true);
225 if (ecolifile == "not found"){ ecolifile = ""; }
226 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
228 namefile = validParameter.validFile(parameters, "name", true);
229 if (namefile == "not found"){ namefile = ""; }
230 else if(namefile == "not open"){ namefile = ""; abort = true; }
231 else { m->setNameFile(namefile); }
233 groupfile = validParameter.validFile(parameters, "group", true);
234 if (groupfile == "not found"){ groupfile = ""; }
235 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
236 else { m->setGroupFile(groupfile); }
238 countfile = validParameter.validFile(parameters, "count", true);
239 if (countfile == "not open") { countfile = ""; abort = true; }
240 else if (countfile == "not found") { countfile = ""; }
241 else { m->setCountTableFile(countfile); }
243 if ((namefile != "") && (countfile != "")) {
244 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
247 if ((groupfile != "") && (countfile != "")) {
248 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
251 taxfile = validParameter.validFile(parameters, "taxonomy", true);
252 if (taxfile == "not found"){ taxfile = ""; }
253 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
254 else { m->setTaxonomyFile(taxfile); }
256 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
257 m->mothurConvert(temp, start);
259 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, end);
262 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
263 m->setProcessors(temp);
264 m->mothurConvert(temp, processors);
266 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
268 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
271 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
272 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
275 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
277 if ((ecolifile != "") && (start != -1) && (end != -1)) {
278 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
282 if ((oligosfile != "") && (ecolifile != "")) {
283 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
286 //check to make sure you didn't forget the name file by mistake
287 if (countfile == "") {
288 if (namefile == "") {
289 vector<string> files; files.push_back(fastafile);
290 parser.getNameFile(files);
296 catch(exception& e) {
297 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
301 //***************************************************************************************************************
303 int PcrSeqsCommand::execute(){
306 if (abort == true) { if (calledHelp) { return 0; } return 2; }
308 int start = time(NULL);
310 string thisOutputDir = outputDir;
311 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
312 map<string, string> variables;
313 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
314 variables["[extension]"] = m->getExtension(fastafile);
315 string trimSeqFile = getOutputFileName("fasta",variables);
316 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
317 variables["[tag]"] = "scrap";
318 string badSeqFile = getOutputFileName("fasta",variables);
322 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
323 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
325 vector<unsigned long long> positions;
326 int numFastaSeqs = 0;
327 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
328 positions = m->divideFile(fastafile, processors);
329 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
331 if (processors == 1) {
332 lines.push_back(linePair(0, 1000));
334 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
335 if (positions.size() < processors) { processors = positions.size(); }
337 //figure out how many sequences you have to process
338 int numSeqsPerProcessor = numFastaSeqs / processors;
339 for (int i = 0; i < processors; i++) {
340 int startIndex = i * numSeqsPerProcessor;
341 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
342 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
346 if (m->control_pressed) { return 0; }
348 set<string> badNames;
349 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
350 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
352 if (m->control_pressed) { return 0; }
354 //don't write or keep if blank
355 if (badNames.size() != 0) { writeAccnos(badNames); }
356 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
357 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
359 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
360 if (namefile != "") { readName(badNames); }
361 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
362 if (groupfile != "") { readGroup(badNames); }
363 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
364 if (taxfile != "") { readTax(badNames); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (countfile != "") { readCount(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
369 m->mothurOutEndLine();
370 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
371 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
372 m->mothurOutEndLine();
373 m->mothurOutEndLine();
375 //set fasta file as new current fastafile
377 itTypes = outputTypes.find("fasta");
378 if (itTypes != outputTypes.end()) {
379 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
382 itTypes = outputTypes.find("name");
383 if (itTypes != outputTypes.end()) {
384 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
387 itTypes = outputTypes.find("group");
388 if (itTypes != outputTypes.end()) {
389 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
392 itTypes = outputTypes.find("accnos");
393 if (itTypes != outputTypes.end()) {
394 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
397 itTypes = outputTypes.find("taxonomy");
398 if (itTypes != outputTypes.end()) {
399 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
402 itTypes = outputTypes.find("count");
403 if (itTypes != outputTypes.end()) {
404 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
407 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
408 m->mothurOutEndLine();
414 catch(exception& e) {
415 m->errorOut(e, "PcrSeqsCommand", "execute");
419 /**************************************************************************************************/
420 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
423 vector<int> processIDS;
427 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
429 //loop through and create all the processes you want
430 while (process != processors) {
434 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
437 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
439 //pass numSeqs to parent
441 string tempFile = filename + toString(getpid()) + ".num.temp";
442 m->openOutputFile(tempFile, out);
443 out << num << '\t' << badSeqNames.size() << endl;
444 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
445 out << (*it) << endl;
451 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
452 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
457 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
459 //force parent to wait until all the processes are done
460 for (int i=0;i<processIDS.size();i++) {
461 int temp = processIDS[i];
465 for (int i = 0; i < processIDS.size(); i++) {
467 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
468 m->openInputFile(tempFile, in);
469 int numBadNames = 0; string name = "";
470 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
471 for (int j = 0; j < numBadNames; j++) {
472 in >> name; m->gobble(in);
473 badSeqNames.insert(name);
475 in.close(); m->mothurRemove(tempFile);
477 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
478 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
480 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
481 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
485 //////////////////////////////////////////////////////////////////////////////////////////////////////
486 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
487 //Above fork() will clone, so memory is separate, but that's not the case with windows,
488 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
489 //////////////////////////////////////////////////////////////////////////////////////////////////////
491 vector<pcrData*> pDataArray;
492 DWORD dwThreadIdArray[processors-1];
493 HANDLE hThreadArray[processors-1];
495 //Create processor worker threads.
496 for( int i=0; i<processors-1; i++ ){
498 string extension = "";
499 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
501 // Allocate memory for thread data.
502 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
503 pDataArray.push_back(tempPcr);
505 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
506 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
510 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
511 processIDS.push_back(processors-1);
513 //Wait until all threads have terminated.
514 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
516 //Close all thread handles and free memory allocations.
517 for(int i=0; i < pDataArray.size(); i++){
518 num += pDataArray[i]->count;
519 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
520 CloseHandle(hThreadArray[i]);
521 delete pDataArray[i];
524 for (int i = 0; i < processIDS.size(); i++) {
525 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
526 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
528 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
529 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
537 catch(exception& e) {
538 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
543 //**********************************************************************************************************************
544 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
547 m->openOutputFile(goodFasta, goodFile);
550 m->openOutputFile(badFasta, badFile);
553 m->openInputFile(filename, inFASTA);
555 inFASTA.seekg(filePos.start);
563 if (m->control_pressed) { break; }
565 Sequence currSeq(inFASTA); m->gobble(inFASTA);
567 string trashCode = "";
568 if (currSeq.getName() != "") {
571 if (oligosfile != "") {
572 map<int, int> mapAligned;
573 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
576 if (primers.size() != 0) {
577 int primerStart = 0; int primerEnd = 0;
578 bool good = findForward(currSeq, primerStart, primerEnd);
580 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
585 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
586 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
589 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
590 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
593 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
594 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
599 //process reverse primers
600 if (revPrimer.size() != 0) {
601 int primerStart = 0; int primerEnd = 0;
602 bool good = findReverse(currSeq, primerStart, primerEnd);
603 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
608 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
609 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
612 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
613 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
617 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
618 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
622 }else if (ecolifile != "") {
623 //make sure the seqs are aligned
624 lengths.insert(currSeq.getAligned().length());
625 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
626 else if (currSeq.getAligned().length() != length) {
627 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
630 currSeq.filterToPos(start);
631 currSeq.filterFromPos(end);
633 string seqString = currSeq.getAligned().substr(0, end);
634 seqString = seqString.substr(start);
635 currSeq.setAligned(seqString);
638 }else{ //using start and end to trim
639 //make sure the seqs are aligned
640 lengths.insert(currSeq.getAligned().length());
641 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
644 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
646 if (keepdots) { currSeq.filterFromPos(end); }
648 string seqString = currSeq.getAligned().substr(0, end);
649 currSeq.setAligned(seqString);
654 if (keepdots) { currSeq.filterToPos(start); }
656 string seqString = currSeq.getAligned().substr(start);
657 currSeq.setAligned(seqString);
663 //trimming removed all bases
664 if (currSeq.getUnaligned() == "") { goodSeq = false; }
666 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
668 badSeqNames.insert(currSeq.getName());
669 currSeq.setName(currSeq.getName() + '|' + trashCode);
670 currSeq.printSequence(badFile);
675 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
676 unsigned long long pos = inFASTA.tellg();
677 if ((pos == -1) || (pos >= filePos.end)) { break; }
679 if (inFASTA.eof()) { break; }
683 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
686 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
694 catch(exception& e) {
695 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
699 //********************************************************************/
700 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
703 string rawSequence = seq.getUnaligned();
705 for(int j=0;j<primers.size();j++){
706 string oligo = primers[j];
708 if(rawSequence.length() < oligo.length()) { break; }
711 int olength = oligo.length();
712 for (int j = 0; j < rawSequence.length()-olength; j++){
713 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
714 string rawChunk = rawSequence.substr(j, olength);
715 if(compareDNASeq(oligo, rawChunk)) {
717 primerEnd = primerStart + olength;
724 primerStart = 0; primerEnd = 0;
728 catch(exception& e) {
729 m->errorOut(e, "TrimOligos", "stripForward");
733 //******************************************************************/
734 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
737 string rawSequence = seq.getUnaligned();
739 for(int i=0;i<revPrimer.size();i++){
740 string oligo = revPrimer[i];
741 if(rawSequence.length() < oligo.length()) { break; }
744 int olength = oligo.length();
745 for (int j = rawSequence.length()-olength; j >= 0; j--){
746 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
747 string rawChunk = rawSequence.substr(j, olength);
749 if(compareDNASeq(oligo, rawChunk)) {
751 primerEnd = primerStart + olength;
758 primerStart = 0; primerEnd = 0;
761 catch(exception& e) {
762 m->errorOut(e, "PcrSeqsCommand", "findReverse");
766 //********************************************************************/
767 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
769 bool isAligned = false;
772 for (int i = 0; i < seq.length(); i++) {
773 if (!isalpha(seq[i])) { isAligned = true; }
774 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
775 } //ie. the 3rd base may be at spot 10 in the alignment
776 //later when we trim we want to trim from spot 10.
779 catch(exception& e) {
780 m->errorOut(e, "PcrSeqsCommand", "isAligned");
784 //********************************************************************/
785 string PcrSeqsCommand::reverseOligo(string oligo){
789 for(int i=oligo.length()-1;i>=0;i--){
791 if(oligo[i] == 'A') { reverse += 'T'; }
792 else if(oligo[i] == 'T'){ reverse += 'A'; }
793 else if(oligo[i] == 'U'){ reverse += 'A'; }
795 else if(oligo[i] == 'G'){ reverse += 'C'; }
796 else if(oligo[i] == 'C'){ reverse += 'G'; }
798 else if(oligo[i] == 'R'){ reverse += 'Y'; }
799 else if(oligo[i] == 'Y'){ reverse += 'R'; }
801 else if(oligo[i] == 'M'){ reverse += 'K'; }
802 else if(oligo[i] == 'K'){ reverse += 'M'; }
804 else if(oligo[i] == 'W'){ reverse += 'W'; }
805 else if(oligo[i] == 'S'){ reverse += 'S'; }
807 else if(oligo[i] == 'B'){ reverse += 'V'; }
808 else if(oligo[i] == 'V'){ reverse += 'B'; }
810 else if(oligo[i] == 'D'){ reverse += 'H'; }
811 else if(oligo[i] == 'H'){ reverse += 'D'; }
813 else { reverse += 'N'; }
819 catch(exception& e) {
820 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
825 //***************************************************************************************************************
826 bool PcrSeqsCommand::readOligos(){
829 m->openInputFile(oligosfile, inOligos);
831 string type, oligo, group;
833 while(!inOligos.eof()){
837 if(type[0] == '#'){ //ignore
838 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
842 //make type case insensitive
843 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
847 for(int i=0;i<oligo.length();i++){
848 oligo[i] = toupper(oligo[i]);
849 if(oligo[i] == 'U') { oligo[i] = 'T'; }
852 if(type == "FORWARD"){
853 // get rest of line in case there is a primer name
854 while (!inOligos.eof()) {
855 char c = inOligos.get();
856 if (c == 10 || c == 13){ break; }
857 else if (c == 32 || c == 9){;} //space or tab
859 primers.push_back(oligo);
860 }else if(type == "REVERSE"){
861 string oligoRC = reverseOligo(oligo);
862 revPrimer.push_back(oligoRC);
863 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
864 }else if(type == "BARCODE"){
866 }else if((type == "LINKER")||(type == "SPACER")) {;}
867 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
873 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
874 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
875 m->control_pressed = true;
881 }catch(exception& e) {
882 m->errorOut(e, "PcrSeqsCommand", "readOligos");
886 //***************************************************************************************************************
887 bool PcrSeqsCommand::readEcoli(){
890 m->openInputFile(ecolifile, in);
895 length = ecoli.getAligned().length();
896 start = ecoli.getStartPos();
897 end = ecoli.getEndPos();
898 }else { in.close(); m->control_pressed = true; return false; }
903 catch(exception& e) {
904 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
909 //***************************************************************************************************************
910 int PcrSeqsCommand::writeAccnos(set<string> badNames){
912 string thisOutputDir = outputDir;
913 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
914 map<string, string> variables;
915 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
916 string outputFileName = getOutputFileName("accnos",variables);
917 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
920 m->openOutputFile(outputFileName, out);
922 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
923 if (m->control_pressed) { break; }
924 out << (*it) << endl;
930 catch(exception& e) {
931 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
936 //******************************************************************/
937 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
940 int length = oligo.length();
942 for(int i=0;i<length;i++){
944 if(oligo[i] != seq[i]){
945 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
946 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
947 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
948 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
949 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
950 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
951 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
952 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
953 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
954 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
955 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
956 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
958 if(success == 0) { break; }
967 catch(exception& e) {
968 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
973 //***************************************************************************************************************
974 int PcrSeqsCommand::readName(set<string>& names){
976 string thisOutputDir = outputDir;
977 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
978 map<string, string> variables;
979 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
980 variables["[extension]"] = m->getExtension(namefile);
981 string outputFileName = getOutputFileName("name", variables);
984 m->openOutputFile(outputFileName, out);
987 m->openInputFile(namefile, in);
988 string name, firstCol, secondCol;
990 bool wroteSomething = false;
991 int removedCount = 0;
994 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
996 in >> firstCol; m->gobble(in);
999 string savedSecond = secondCol;
1000 vector<string> parsedNames;
1001 m->splitAtComma(secondCol, parsedNames);
1003 vector<string> validSecond; validSecond.clear();
1004 for (int i = 0; i < parsedNames.size(); i++) {
1005 if (names.count(parsedNames[i]) == 0) {
1006 validSecond.push_back(parsedNames[i]);
1010 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1011 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1012 removedCount += parsedNames.size();
1014 out << firstCol << '\t' << savedSecond << endl;
1015 wroteSomething = true;
1022 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1023 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1025 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1029 catch(exception& e) {
1030 m->errorOut(e, "PcrSeqsCommand", "readName");
1034 //**********************************************************************************************************************
1035 int PcrSeqsCommand::readGroup(set<string> names){
1037 string thisOutputDir = outputDir;
1038 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1039 map<string, string> variables;
1040 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1041 variables["[extension]"] = m->getExtension(groupfile);
1042 string outputFileName = getOutputFileName("group", variables);
1045 m->openOutputFile(outputFileName, out);
1048 m->openInputFile(groupfile, in);
1051 bool wroteSomething = false;
1052 int removedCount = 0;
1055 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1057 in >> name; //read from first column
1058 in >> group; //read from second column
1060 //if this name is in the accnos file
1061 if (names.count(name) == 0) {
1062 wroteSomething = true;
1063 out << name << '\t' << group << endl;
1064 }else { removedCount++; }
1071 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1072 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1074 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1079 catch(exception& e) {
1080 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1084 //**********************************************************************************************************************
1085 int PcrSeqsCommand::readTax(set<string> names){
1087 string thisOutputDir = outputDir;
1088 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1089 map<string, string> variables;
1090 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1091 variables["[extension]"] = m->getExtension(taxfile);
1092 string outputFileName = getOutputFileName("taxonomy", variables);
1095 m->openOutputFile(outputFileName, out);
1098 m->openInputFile(taxfile, in);
1101 bool wroteSomething = false;
1102 int removedCount = 0;
1105 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1107 in >> name; //read from first column
1108 in >> tax; //read from second column
1110 //if this name is in the accnos file
1111 if (names.count(name) == 0) {
1112 wroteSomething = true;
1113 out << name << '\t' << tax << endl;
1114 }else { removedCount++; }
1121 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1122 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1124 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1128 catch(exception& e) {
1129 m->errorOut(e, "PcrSeqsCommand", "readTax");
1133 //***************************************************************************************************************
1134 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1137 m->openInputFile(countfile, in);
1138 set<string>::iterator it;
1140 map<string, string> variables;
1141 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1142 variables["[extension]"] = m->getExtension(countfile);
1143 string goodCountFile = getOutputFileName("count", variables);
1145 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1146 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1148 string headers = m->getline(in); m->gobble(in);
1149 goodCountOut << headers << endl;
1151 string name, rest; int thisTotal, removedCount; removedCount = 0;
1152 bool wroteSomething = false;
1155 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1157 in >> name; m->gobble(in);
1158 in >> thisTotal; m->gobble(in);
1159 rest = m->getline(in); m->gobble(in);
1161 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1163 wroteSomething = true;
1164 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1168 goodCountOut.close();
1170 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1172 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1174 //check for groups that have been eliminated
1176 if (ct.testGroups(goodCountFile)) {
1177 ct.readTable(goodCountFile);
1178 ct.printTable(goodCountFile);
1181 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1183 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1189 catch(exception& e) {
1190 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1194 /**************************************************************************************/