5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
28 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "PcrSeqsCommand", "setParameters");
41 //**********************************************************************************************************************
42 string PcrSeqsCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The pcr.seqs command reads a fasta file.\n";
46 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
47 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
48 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
49 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
50 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
51 helpString += "The processors parameter allows you to use multiple processors.\n";
52 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
53 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
54 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
55 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
56 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
60 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string PcrSeqsCommand::getOutputPattern(string type) {
69 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
70 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
72 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
73 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
74 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
80 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
84 //**********************************************************************************************************************
86 PcrSeqsCommand::PcrSeqsCommand(){
88 abort = true; calledHelp = true;
90 vector<string> tempOutNames;
91 outputTypes["fasta"] = tempOutNames;
92 outputTypes["taxonomy"] = tempOutNames;
93 outputTypes["group"] = tempOutNames;
94 outputTypes["name"] = tempOutNames;
95 outputTypes["count"] = tempOutNames;
96 outputTypes["accnos"] = tempOutNames;
99 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
103 //***************************************************************************************************************
105 PcrSeqsCommand::PcrSeqsCommand(string option) {
108 abort = false; calledHelp = false;
110 //allow user to run help
111 if(option == "help") { help(); abort = true; calledHelp = true; }
112 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string,string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string,string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["fasta"] = tempOutNames;
131 outputTypes["taxonomy"] = tempOutNames;
132 outputTypes["group"] = tempOutNames;
133 outputTypes["name"] = tempOutNames;
134 outputTypes["accnos"] = tempOutNames;
135 outputTypes["count"] = tempOutNames;
137 //if the user changes the input directory command factory will send this info to us in the output parameter
138 string inputDir = validParameter.validFile(parameters, "inputdir", false);
139 if (inputDir == "not found"){ inputDir = ""; }
142 it = parameters.find("fasta");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["fasta"] = inputDir + it->second; }
150 it = parameters.find("oligos");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["oligos"] = inputDir + it->second; }
158 it = parameters.find("ecoli");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
166 it = parameters.find("taxonomy");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
182 it = parameters.find("group");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["group"] = inputDir + it->second; }
190 it = parameters.find("count");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["count"] = inputDir + it->second; }
201 //check for required parameters
202 fastafile = validParameter.validFile(parameters, "fasta", true);
203 if (fastafile == "not found") {
204 fastafile = m->getFastaFile();
205 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
206 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
207 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
208 else { m->setFastaFile(fastafile); }
210 //if the user changes the output directory command factory will send this info to us in the output parameter
211 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
213 //check for optional parameter and set defaults
214 // ...at some point should added some additional type checking...
216 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
217 keepprimer = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
220 keepdots = m->isTrue(temp);
222 temp = validParameter.validFile(parameters, "oligos", true);
223 if (temp == "not found"){ oligosfile = ""; }
224 else if(temp == "not open"){ oligosfile = ""; abort = true; }
225 else { oligosfile = temp; m->setOligosFile(oligosfile); }
227 ecolifile = validParameter.validFile(parameters, "ecoli", true);
228 if (ecolifile == "not found"){ ecolifile = ""; }
229 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
231 namefile = validParameter.validFile(parameters, "name", true);
232 if (namefile == "not found"){ namefile = ""; }
233 else if(namefile == "not open"){ namefile = ""; abort = true; }
234 else { m->setNameFile(namefile); }
236 groupfile = validParameter.validFile(parameters, "group", true);
237 if (groupfile == "not found"){ groupfile = ""; }
238 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
239 else { m->setGroupFile(groupfile); }
241 countfile = validParameter.validFile(parameters, "count", true);
242 if (countfile == "not open") { countfile = ""; abort = true; }
243 else if (countfile == "not found") { countfile = ""; }
244 else { m->setCountTableFile(countfile); }
246 if ((namefile != "") && (countfile != "")) {
247 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
250 if ((groupfile != "") && (countfile != "")) {
251 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
254 taxfile = validParameter.validFile(parameters, "taxonomy", true);
255 if (taxfile == "not found"){ taxfile = ""; }
256 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
257 else { m->setTaxonomyFile(taxfile); }
259 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, start);
262 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
263 m->mothurConvert(temp, end);
265 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
266 m->setProcessors(temp);
267 m->mothurConvert(temp, processors);
269 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, pdiffs);
272 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
274 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
277 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
278 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
281 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
283 if ((ecolifile != "") && (start != -1) && (end != -1)) {
284 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
288 if ((oligosfile != "") && (ecolifile != "")) {
289 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
292 //check to make sure you didn't forget the name file by mistake
293 if (countfile == "") {
294 if (namefile == "") {
295 vector<string> files; files.push_back(fastafile);
296 parser.getNameFile(files);
302 catch(exception& e) {
303 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
307 //***************************************************************************************************************
309 int PcrSeqsCommand::execute(){
312 if (abort == true) { if (calledHelp) { return 0; } return 2; }
314 int start = time(NULL);
317 string thisOutputDir = outputDir;
318 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
319 map<string, string> variables;
320 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
321 variables["[extension]"] = m->getExtension(fastafile);
322 string trimSeqFile = getOutputFileName("fasta",variables);
323 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
324 variables["[tag]"] = "scrap";
325 string badSeqFile = getOutputFileName("fasta",variables);
329 if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; }
330 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
332 vector<unsigned long long> positions;
333 int numFastaSeqs = 0;
334 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
335 positions = m->divideFile(fastafile, processors);
336 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
338 if (processors == 1) {
339 lines.push_back(linePair(0, 1000));
341 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
342 if (positions.size() < processors) { processors = positions.size(); }
344 //figure out how many sequences you have to process
345 int numSeqsPerProcessor = numFastaSeqs / processors;
346 for (int i = 0; i < processors; i++) {
347 int startIndex = i * numSeqsPerProcessor;
348 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
349 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
353 if (m->control_pressed) { return 0; }
355 set<string> badNames;
356 numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);
358 if (m->control_pressed) { return 0; }
360 //don't write or keep if blank
361 if (badNames.size() != 0) { writeAccnos(badNames); }
362 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
363 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (namefile != "") { readName(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
368 if (groupfile != "") { readGroup(badNames); }
369 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
370 if (taxfile != "") { readTax(badNames); }
371 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 if (countfile != "") { readCount(badNames); }
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 m->mothurOutEndLine();
376 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
377 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
378 m->mothurOutEndLine();
379 m->mothurOutEndLine();
381 //set fasta file as new current fastafile
383 itTypes = outputTypes.find("fasta");
384 if (itTypes != outputTypes.end()) {
385 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
388 itTypes = outputTypes.find("name");
389 if (itTypes != outputTypes.end()) {
390 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
393 itTypes = outputTypes.find("group");
394 if (itTypes != outputTypes.end()) {
395 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
398 itTypes = outputTypes.find("accnos");
399 if (itTypes != outputTypes.end()) {
400 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
403 itTypes = outputTypes.find("taxonomy");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
408 itTypes = outputTypes.find("count");
409 if (itTypes != outputTypes.end()) {
410 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
413 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
414 m->mothurOutEndLine();
420 catch(exception& e) {
421 m->errorOut(e, "PcrSeqsCommand", "execute");
425 /**************************************************************************************************/
426 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
429 vector<int> processIDS;
432 int pstart = -1; int pend = -1;
433 bool adjustNeeded = false;
435 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
437 //loop through and create all the processes you want
438 while (process != processors) {
442 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
445 string locationsFile = toString(getpid()) + ".temp";
446 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, pend, adjustNeeded);
448 //pass numSeqs to parent
450 string tempFile = filename + toString(getpid()) + ".num.temp";
451 m->openOutputFile(tempFile, out);
452 out << pstart << '\t' << pend << '\t' << adjustNeeded << endl;
453 out << num << '\t' << badSeqNames.size() << endl;
454 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
455 out << (*it) << endl;
461 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
462 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
467 string locationsFile = toString(getpid()) + ".temp";
468 num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, pend, adjustNeeded);
470 //force parent to wait until all the processes are done
471 for (int i=0;i<processIDS.size();i++) {
472 int temp = processIDS[i];
476 for (int i = 0; i < processIDS.size(); i++) {
478 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
479 m->openInputFile(tempFile, in);
480 int numBadNames = 0; string name = "";
481 int tpstart = -1; int tpend = -1; bool tempAdjust = false;
484 in >> tpstart >> tpend >> tempAdjust; m->gobble(in);
486 if (tempAdjust) { adjustNeeded = true; }
488 if (tpstart != pstart) { adjustNeeded = true; }
489 if (tpstart < pstart) { pstart = tpstart; } //smallest start
492 if (tpend != pend) { adjustNeeded = true; }
493 if (tpend > pend) { pend = tpend; } //largest end
495 int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in);
497 for (int j = 0; j < numBadNames; j++) {
498 in >> name; m->gobble(in);
499 badSeqNames.insert(name);
501 in.close(); m->mothurRemove(tempFile);
503 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
504 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
506 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
507 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
509 m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile);
510 m->mothurRemove((toString(processIDS[i]) + ".temp"));
514 //////////////////////////////////////////////////////////////////////////////////////////////////////
515 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
516 //Above fork() will clone, so memory is separate, but that's not the case with windows,
517 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
518 //////////////////////////////////////////////////////////////////////////////////////////////////////
520 vector<pcrData*> pDataArray;
521 DWORD dwThreadIdArray[processors-1];
522 HANDLE hThreadArray[processors-1];
524 string locationsFile = "locationsFile.txt";
525 m->mothurRemove(locationsFile);
526 m->mothurRemove(goodFileName);
527 m->mothurRemove(badFileName);
529 //Create processor worker threads.
530 for( int i=0; i<processors-1; i++ ){
532 string extension = "";
533 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
535 // Allocate memory for thread data.
536 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, locationsFile+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end);
537 pDataArray.push_back(tempPcr);
539 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
540 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
544 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, pend, adjustNeeded);
545 processIDS.push_back(processors-1);
547 //Wait until all threads have terminated.
548 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
550 //Close all thread handles and free memory allocations.
551 for(int i=0; i < pDataArray.size(); i++){
552 num += pDataArray[i]->count;
553 if (pDataArray[i]->count != pDataArray[i]->fend) {
554 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
556 if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; }
557 if (pDataArray[i]->pstart != -1) {
558 if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; }
559 if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; }
561 if (pDataArray[i]->pend != -1) {
562 if (pDataArray[i]->pend != pend) { adjustNeeded = true; }
563 if (pDataArray[i]->pend > pend) { pend = pDataArray[i]->pend; }
566 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
567 CloseHandle(hThreadArray[i]);
568 delete pDataArray[i];
571 for (int i = 0; i < processIDS.size(); i++) {
572 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
573 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
575 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
576 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
578 m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile);
579 m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp"));
583 if (fileAligned && adjustNeeded) { adjustDots(goodFileName, locationsFile, pstart, pend); }
588 catch(exception& e) {
589 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
594 //**********************************************************************************************************************
595 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set<string>& badSeqNames, linePair filePos, int& pstart, int& pend, bool& adjustNeeded){
598 m->openOutputFile(goodFasta, goodFile);
601 m->openOutputFile(badFasta, badFile);
603 ofstream locationsFile;
604 m->openOutputFile(locationsName, locationsFile);
607 m->openInputFile(filename, inFASTA);
609 inFASTA.seekg(filePos.start);
614 vector< set<int> > locations; //locations[0] = beginning locations, locations[1] = ending locations
617 //pdiffs, bdiffs, primers, barcodes, revPrimers
618 map<string, int> faked;
619 TrimOligos trim(pdiffs, 0, primers, faked, revPrimer);
623 if (m->control_pressed) { break; }
625 Sequence currSeq(inFASTA); m->gobble(inFASTA);
627 if (fileAligned) { //assume aligned until proven otherwise
628 lengths.insert(currSeq.getAligned().length());
629 if (lengths.size() > 1) { fileAligned = false; }
632 string trashCode = "";
633 string locationsString = "";
636 if (currSeq.getName() != "") {
638 if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); }
641 if (oligosfile != "") {
642 map<int, int> mapAligned;
643 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
646 if (primers.size() != 0) {
647 int primerStart = 0; int primerEnd = 0;
648 bool good = trim.findForward(currSeq, primerStart, primerEnd);
650 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
655 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned.
657 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1));
659 thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1);
660 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
665 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
667 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));
669 thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]);
670 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
674 isAligned(currSeq.getAligned(), mapAligned);
676 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
677 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
682 //process reverse primers
683 if (revPrimer.size() != 0) {
684 int primerStart = 0; int primerEnd = 0;
685 bool good = trim.findReverse(currSeq, primerStart, primerEnd);
686 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
691 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
693 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));
695 thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]);
696 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
701 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); }
703 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1));
705 thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1);
706 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
712 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
713 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
717 }else if (ecolifile != "") {
718 //make sure the seqs are aligned
719 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
720 else if (currSeq.getAligned().length() != length) {
721 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
724 currSeq.filterToPos(start);
725 currSeq.filterFromPos(end);
727 string seqString = currSeq.getAligned().substr(0, end);
728 seqString = seqString.substr(start);
729 currSeq.setAligned(seqString);
732 }else{ //using start and end to trim
733 //make sure the seqs are aligned
734 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
737 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
739 if (keepdots) { currSeq.filterFromPos(end); }
741 string seqString = currSeq.getAligned().substr(0, end);
742 currSeq.setAligned(seqString);
747 if (keepdots) { currSeq.filterToPos(start); }
749 string seqString = currSeq.getAligned().substr(start);
750 currSeq.setAligned(seqString);
756 //trimming removed all bases
757 if (currSeq.getUnaligned() == "") { goodSeq = false; }
760 currSeq.printSequence(goodFile);
761 if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); }
762 if (locationsString != "") { locationsFile << locationsString; }
763 if (thisPStart != -1) { locations[0].insert(thisPStart); }
764 if (thisPEnd != -1) { locations[1].insert(thisPEnd); }
767 badSeqNames.insert(currSeq.getName());
768 currSeq.setName(currSeq.getName() + '|' + trashCode);
769 currSeq.printSequence(badFile);
774 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
775 unsigned long long pos = inFASTA.tellg();
776 if ((pos == -1) || (pos >= filePos.end)) { break; }
778 if (inFASTA.eof()) { break; }
782 if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
785 if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
790 locationsFile.close();
792 if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); }
794 if (fileAligned && !keepdots) { //print out smallest start value and largest end value
795 if ((locations[0].size() > 1) || (locations[1].size() > 1)) { adjustNeeded = true; }
796 if (primers.size() != 0) { set<int>::iterator it = locations[0].begin(); pstart = *it; }
797 if (revPrimer.size() != 0) { set<int>::reverse_iterator it2 = locations[1].rbegin(); pend = *it2; }
802 catch(exception& e) {
803 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
807 //********************************************************************/
808 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
811 bool isAligned = false;
814 for (int i = 0; i < seq.length(); i++) {
815 if (!isalpha(seq[i])) { isAligned = true; }
816 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
817 } //ie. the 3rd base may be at spot 10 in the alignment
818 //later when we trim we want to trim from spot 10.
821 catch(exception& e) {
822 m->errorOut(e, "PcrSeqsCommand", "isAligned");
826 //**********************************************************************************************************************
827 int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){
830 m->openInputFile(goodFasta, inFasta);
832 ifstream inLocations;
833 m->openInputFile(locations, inLocations);
836 m->openOutputFile(goodFasta+".temp", out);
839 //cout << pstart << '\t' << pend << endl;
841 while(!inFasta.eof()) {
842 if(m->control_pressed) { break; }
844 Sequence seq(inFasta); m->gobble(inFasta);
847 int thisStart = -1; int thisEnd = -1;
848 if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); }
849 if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); }
850 //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl;
851 //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl;
853 if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); }
856 if (thisStart != -1) {
857 if (thisStart != pstart) {
859 for (int i = pstart; i < thisStart; i++) { dots += "."; }
860 thisEnd += dots.length();
861 dots += seq.getAligned();
862 seq.setAligned(dots);
869 if (thisEnd != pend) {
870 string dots = seq.getAligned();
871 for (int i = thisEnd; i < pend; i++) { dots += "."; }
872 seq.setAligned(dots);
876 lengths.insert(seq.getAligned().length());
879 seq.printSequence(out);
884 m->mothurRemove(locations);
885 m->mothurRemove(goodFasta);
886 m->renameFile(goodFasta+".temp", goodFasta);
888 //cout << "final lengths = \n";
889 //for (set<int>::iterator it = lengths.begin(); it != lengths.end(); it++) {
890 // cout << *it << endl;
895 catch(exception& e) {
896 m->errorOut(e, "PcrSeqsCommand", "adjustDots");
900 //********************************************************************/
901 string PcrSeqsCommand::reverseOligo(string oligo){
905 for(int i=oligo.length()-1;i>=0;i--){
907 if(oligo[i] == 'A') { reverse += 'T'; }
908 else if(oligo[i] == 'T'){ reverse += 'A'; }
909 else if(oligo[i] == 'U'){ reverse += 'A'; }
911 else if(oligo[i] == 'G'){ reverse += 'C'; }
912 else if(oligo[i] == 'C'){ reverse += 'G'; }
914 else if(oligo[i] == 'R'){ reverse += 'Y'; }
915 else if(oligo[i] == 'Y'){ reverse += 'R'; }
917 else if(oligo[i] == 'M'){ reverse += 'K'; }
918 else if(oligo[i] == 'K'){ reverse += 'M'; }
920 else if(oligo[i] == 'W'){ reverse += 'W'; }
921 else if(oligo[i] == 'S'){ reverse += 'S'; }
923 else if(oligo[i] == 'B'){ reverse += 'V'; }
924 else if(oligo[i] == 'V'){ reverse += 'B'; }
926 else if(oligo[i] == 'D'){ reverse += 'H'; }
927 else if(oligo[i] == 'H'){ reverse += 'D'; }
929 else { reverse += 'N'; }
935 catch(exception& e) {
936 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
941 //***************************************************************************************************************
942 bool PcrSeqsCommand::readOligos(){
945 m->openInputFile(oligosfile, inOligos);
947 string type, oligo, group;
950 while(!inOligos.eof()){
954 if(type[0] == '#'){ //ignore
955 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
959 //make type case insensitive
960 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
964 for(int i=0;i<oligo.length();i++){
965 oligo[i] = toupper(oligo[i]);
966 if(oligo[i] == 'U') { oligo[i] = 'T'; }
969 if(type == "FORWARD"){
970 // get rest of line in case there is a primer name
971 while (!inOligos.eof()) {
972 char c = inOligos.get();
973 if (c == 10 || c == 13 || c == -1){ break; }
974 else if (c == 32 || c == 9){;} //space or tab
976 primers[oligo] = primerCount; primerCount++;
977 //cout << "for oligo = " << oligo << endl;
978 }else if(type == "REVERSE"){
979 string oligoRC = reverseOligo(oligo);
980 revPrimer.push_back(oligoRC);
981 //cout << "rev oligo = " << oligo << " reverse = " << oligoRC << endl;
982 }else if(type == "BARCODE"){
984 }else if(type == "PRIMER"){
986 primers[oligo] = primerCount; primerCount++;
991 for(int i=0;i<roligo.length();i++){
992 roligo[i] = toupper(roligo[i]);
993 if(roligo[i] == 'U') { roligo[i] = 'T'; }
995 revPrimer.push_back(reverseOligo(roligo));
997 // get rest of line in case there is a primer name
998 while (!inOligos.eof()) {
999 char c = inOligos.get();
1000 if (c == 10 || c == 13 || c == -1){ break; }
1001 else if (c == 32 || c == 9){;} //space or tab
1003 //cout << "prim oligo = " << oligo << " reverse = " << roligo << endl;
1004 }else if((type == "LINKER")||(type == "SPACER")) {;}
1005 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are primer, forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
1007 m->gobble(inOligos);
1011 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
1012 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
1013 m->control_pressed = true;
1019 }catch(exception& e) {
1020 m->errorOut(e, "PcrSeqsCommand", "readOligos");
1024 //***************************************************************************************************************
1025 bool PcrSeqsCommand::readEcoli(){
1028 m->openInputFile(ecolifile, in);
1033 length = ecoli.getAligned().length();
1034 start = ecoli.getStartPos();
1035 end = ecoli.getEndPos();
1036 }else { in.close(); m->control_pressed = true; return false; }
1041 catch(exception& e) {
1042 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
1047 //***************************************************************************************************************
1048 int PcrSeqsCommand::writeAccnos(set<string> badNames){
1050 string thisOutputDir = outputDir;
1051 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
1052 map<string, string> variables;
1053 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1054 string outputFileName = getOutputFileName("accnos",variables);
1055 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
1058 m->openOutputFile(outputFileName, out);
1060 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
1061 if (m->control_pressed) { break; }
1062 out << (*it) << endl;
1068 catch(exception& e) {
1069 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
1074 //***************************************************************************************************************
1075 int PcrSeqsCommand::readName(set<string>& names){
1077 string thisOutputDir = outputDir;
1078 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
1079 map<string, string> variables;
1080 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
1081 variables["[extension]"] = m->getExtension(namefile);
1082 string outputFileName = getOutputFileName("name", variables);
1085 m->openOutputFile(outputFileName, out);
1088 m->openInputFile(namefile, in);
1089 string name, firstCol, secondCol;
1091 bool wroteSomething = false;
1092 int removedCount = 0;
1095 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1097 in >> firstCol; m->gobble(in);
1100 string savedSecond = secondCol;
1101 vector<string> parsedNames;
1102 m->splitAtComma(secondCol, parsedNames);
1104 vector<string> validSecond; validSecond.clear();
1105 for (int i = 0; i < parsedNames.size(); i++) {
1106 if (names.count(parsedNames[i]) == 0) {
1107 validSecond.push_back(parsedNames[i]);
1111 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1112 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1113 removedCount += parsedNames.size();
1115 out << firstCol << '\t' << savedSecond << endl;
1116 wroteSomething = true;
1123 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1124 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1126 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1130 catch(exception& e) {
1131 m->errorOut(e, "PcrSeqsCommand", "readName");
1135 //**********************************************************************************************************************
1136 int PcrSeqsCommand::readGroup(set<string> names){
1138 string thisOutputDir = outputDir;
1139 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1140 map<string, string> variables;
1141 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1142 variables["[extension]"] = m->getExtension(groupfile);
1143 string outputFileName = getOutputFileName("group", variables);
1146 m->openOutputFile(outputFileName, out);
1149 m->openInputFile(groupfile, in);
1152 bool wroteSomething = false;
1153 int removedCount = 0;
1156 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1158 in >> name; //read from first column
1159 in >> group; //read from second column
1161 //if this name is in the accnos file
1162 if (names.count(name) == 0) {
1163 wroteSomething = true;
1164 out << name << '\t' << group << endl;
1165 }else { removedCount++; }
1172 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1173 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1175 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1180 catch(exception& e) {
1181 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1185 //**********************************************************************************************************************
1186 int PcrSeqsCommand::readTax(set<string> names){
1188 string thisOutputDir = outputDir;
1189 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1190 map<string, string> variables;
1191 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1192 variables["[extension]"] = m->getExtension(taxfile);
1193 string outputFileName = getOutputFileName("taxonomy", variables);
1196 m->openOutputFile(outputFileName, out);
1199 m->openInputFile(taxfile, in);
1202 bool wroteSomething = false;
1203 int removedCount = 0;
1206 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1208 in >> name; //read from first column
1209 in >> tax; //read from second column
1211 //if this name is in the accnos file
1212 if (names.count(name) == 0) {
1213 wroteSomething = true;
1214 out << name << '\t' << tax << endl;
1215 }else { removedCount++; }
1222 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1223 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1225 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1229 catch(exception& e) {
1230 m->errorOut(e, "PcrSeqsCommand", "readTax");
1234 //***************************************************************************************************************
1235 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1238 m->openInputFile(countfile, in);
1239 set<string>::iterator it;
1241 map<string, string> variables;
1242 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1243 variables["[extension]"] = m->getExtension(countfile);
1244 string goodCountFile = getOutputFileName("count", variables);
1246 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1247 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1249 string headers = m->getline(in); m->gobble(in);
1250 goodCountOut << headers << endl;
1252 string name, rest; int thisTotal, removedCount; removedCount = 0;
1253 bool wroteSomething = false;
1256 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1258 in >> name; m->gobble(in);
1259 in >> thisTotal; m->gobble(in);
1260 rest = m->getline(in); m->gobble(in);
1262 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1264 wroteSomething = true;
1265 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1269 goodCountOut.close();
1271 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1273 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1275 //check for groups that have been eliminated
1277 if (ct.testGroups(goodCountFile)) {
1278 ct.readTable(goodCountFile, true);
1279 ct.printTable(goodCountFile);
1282 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1284 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1290 catch(exception& e) {
1291 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1295 /**************************************************************************************/