5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
28 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "PcrSeqsCommand", "setParameters");
41 //**********************************************************************************************************************
42 string PcrSeqsCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The pcr.seqs command reads a fasta file.\n";
46 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
47 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
48 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
49 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
50 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
51 helpString += "The processors parameter allows you to use multiple processors.\n";
52 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
53 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
54 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
55 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
56 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
60 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string PcrSeqsCommand::getOutputPattern(string type) {
69 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
70 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
72 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
73 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
74 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
80 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
84 //**********************************************************************************************************************
86 PcrSeqsCommand::PcrSeqsCommand(){
88 abort = true; calledHelp = true;
90 vector<string> tempOutNames;
91 outputTypes["fasta"] = tempOutNames;
92 outputTypes["taxonomy"] = tempOutNames;
93 outputTypes["group"] = tempOutNames;
94 outputTypes["name"] = tempOutNames;
95 outputTypes["count"] = tempOutNames;
96 outputTypes["accnos"] = tempOutNames;
99 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
103 //***************************************************************************************************************
105 PcrSeqsCommand::PcrSeqsCommand(string option) {
108 abort = false; calledHelp = false;
110 //allow user to run help
111 if(option == "help") { help(); abort = true; calledHelp = true; }
112 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string,string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string,string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["fasta"] = tempOutNames;
131 outputTypes["taxonomy"] = tempOutNames;
132 outputTypes["group"] = tempOutNames;
133 outputTypes["name"] = tempOutNames;
134 outputTypes["accnos"] = tempOutNames;
135 outputTypes["count"] = tempOutNames;
137 //if the user changes the input directory command factory will send this info to us in the output parameter
138 string inputDir = validParameter.validFile(parameters, "inputdir", false);
139 if (inputDir == "not found"){ inputDir = ""; }
142 it = parameters.find("fasta");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["fasta"] = inputDir + it->second; }
150 it = parameters.find("oligos");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["oligos"] = inputDir + it->second; }
158 it = parameters.find("ecoli");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
166 it = parameters.find("taxonomy");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
182 it = parameters.find("group");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["group"] = inputDir + it->second; }
190 it = parameters.find("count");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["count"] = inputDir + it->second; }
201 //check for required parameters
202 fastafile = validParameter.validFile(parameters, "fasta", true);
203 if (fastafile == "not found") {
204 fastafile = m->getFastaFile();
205 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
206 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
207 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
208 else { m->setFastaFile(fastafile); }
210 //if the user changes the output directory command factory will send this info to us in the output parameter
211 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
213 //check for optional parameter and set defaults
214 // ...at some point should added some additional type checking...
216 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
217 keepprimer = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
220 keepdots = m->isTrue(temp);
222 temp = validParameter.validFile(parameters, "oligos", true);
223 if (temp == "not found"){ oligosfile = ""; }
224 else if(temp == "not open"){ oligosfile = ""; abort = true; }
225 else { oligosfile = temp; m->setOligosFile(oligosfile); }
227 ecolifile = validParameter.validFile(parameters, "ecoli", true);
228 if (ecolifile == "not found"){ ecolifile = ""; }
229 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
231 namefile = validParameter.validFile(parameters, "name", true);
232 if (namefile == "not found"){ namefile = ""; }
233 else if(namefile == "not open"){ namefile = ""; abort = true; }
234 else { m->setNameFile(namefile); }
236 groupfile = validParameter.validFile(parameters, "group", true);
237 if (groupfile == "not found"){ groupfile = ""; }
238 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
239 else { m->setGroupFile(groupfile); }
241 countfile = validParameter.validFile(parameters, "count", true);
242 if (countfile == "not open") { countfile = ""; abort = true; }
243 else if (countfile == "not found") { countfile = ""; }
244 else { m->setCountTableFile(countfile); }
246 if ((namefile != "") && (countfile != "")) {
247 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
250 if ((groupfile != "") && (countfile != "")) {
251 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
254 taxfile = validParameter.validFile(parameters, "taxonomy", true);
255 if (taxfile == "not found"){ taxfile = ""; }
256 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
257 else { m->setTaxonomyFile(taxfile); }
259 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, start);
262 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
263 m->mothurConvert(temp, end);
265 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
266 m->setProcessors(temp);
267 m->mothurConvert(temp, processors);
269 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, pdiffs);
272 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
274 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
277 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
278 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
281 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
283 if ((ecolifile != "") && (start != -1) && (end != -1)) {
284 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
288 if ((oligosfile != "") && (ecolifile != "")) {
289 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
292 //check to make sure you didn't forget the name file by mistake
293 if (countfile == "") {
294 if (namefile == "") {
295 vector<string> files; files.push_back(fastafile);
296 parser.getNameFile(files);
302 catch(exception& e) {
303 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
307 //***************************************************************************************************************
309 int PcrSeqsCommand::execute(){
312 if (abort == true) { if (calledHelp) { return 0; } return 2; }
314 int start = time(NULL);
317 string thisOutputDir = outputDir;
318 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
319 map<string, string> variables;
320 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
321 variables["[extension]"] = m->getExtension(fastafile);
322 string trimSeqFile = getOutputFileName("fasta",variables);
323 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
324 variables["[tag]"] = "scrap";
325 string badSeqFile = getOutputFileName("fasta",variables);
329 if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(primers.size()) + ", revprimers = " + toString(revPrimer.size()) + ".\n"); } } if (m->control_pressed) { return 0; }
330 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
332 vector<unsigned long long> positions;
333 int numFastaSeqs = 0;
334 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
335 positions = m->divideFile(fastafile, processors);
336 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
338 if (processors == 1) {
339 lines.push_back(linePair(0, 1000));
341 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
342 if (positions.size() < processors) { processors = positions.size(); }
344 //figure out how many sequences you have to process
345 int numSeqsPerProcessor = numFastaSeqs / processors;
346 for (int i = 0; i < processors; i++) {
347 int startIndex = i * numSeqsPerProcessor;
348 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
349 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
353 if (m->control_pressed) { return 0; }
355 set<string> badNames;
356 numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);
358 if (m->control_pressed) { return 0; }
360 //don't write or keep if blank
361 if (badNames.size() != 0) { writeAccnos(badNames); }
362 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
363 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (namefile != "") { readName(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
368 if (groupfile != "") { readGroup(badNames); }
369 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
370 if (taxfile != "") { readTax(badNames); }
371 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 if (countfile != "") { readCount(badNames); }
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 m->mothurOutEndLine();
376 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
377 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
378 m->mothurOutEndLine();
379 m->mothurOutEndLine();
381 //set fasta file as new current fastafile
383 itTypes = outputTypes.find("fasta");
384 if (itTypes != outputTypes.end()) {
385 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
388 itTypes = outputTypes.find("name");
389 if (itTypes != outputTypes.end()) {
390 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
393 itTypes = outputTypes.find("group");
394 if (itTypes != outputTypes.end()) {
395 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
398 itTypes = outputTypes.find("accnos");
399 if (itTypes != outputTypes.end()) {
400 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
403 itTypes = outputTypes.find("taxonomy");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
408 itTypes = outputTypes.find("count");
409 if (itTypes != outputTypes.end()) {
410 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
413 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
414 m->mothurOutEndLine();
420 catch(exception& e) {
421 m->errorOut(e, "PcrSeqsCommand", "execute");
425 /**************************************************************************************************/
426 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
429 vector<int> processIDS;
432 int pstart = -1; int pend = -1;
433 bool adjustNeeded = false;
435 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
437 //loop through and create all the processes you want
438 while (process != processors) {
442 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
445 string locationsFile = toString(getpid()) + ".temp";
446 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", locationsFile, badSeqNames, lines[process], pstart, pend, adjustNeeded);
448 //pass numSeqs to parent
450 string tempFile = filename + toString(getpid()) + ".num.temp";
451 m->openOutputFile(tempFile, out);
452 out << pstart << '\t' << pend << '\t' << adjustNeeded << endl;
453 out << num << '\t' << badSeqNames.size() << endl;
454 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
455 out << (*it) << endl;
461 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
462 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
467 string locationsFile = toString(getpid()) + ".temp";
468 num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, pend, adjustNeeded);
470 //force parent to wait until all the processes are done
471 for (int i=0;i<processIDS.size();i++) {
472 int temp = processIDS[i];
476 for (int i = 0; i < processIDS.size(); i++) {
478 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
479 m->openInputFile(tempFile, in);
480 int numBadNames = 0; string name = "";
481 int tpstart = -1; int tpend = -1; bool tempAdjust = false;
484 in >> tpstart >> tpend >> tempAdjust; m->gobble(in);
486 if (tempAdjust) { adjustNeeded = true; }
488 if (tpstart != pstart) { adjustNeeded = true; }
489 if (tpstart < pstart) { pstart = tpstart; } //smallest start
492 if (tpend != pend) { adjustNeeded = true; }
493 if (tpend > pend) { pend = tpend; } //largest end
495 int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in);
497 for (int j = 0; j < numBadNames; j++) {
498 in >> name; m->gobble(in);
499 badSeqNames.insert(name);
501 in.close(); m->mothurRemove(tempFile);
503 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
504 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
506 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
507 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
509 m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile);
510 m->mothurRemove((toString(processIDS[i]) + ".temp"));
514 //////////////////////////////////////////////////////////////////////////////////////////////////////
515 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
516 //Above fork() will clone, so memory is separate, but that's not the case with windows,
517 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
518 //////////////////////////////////////////////////////////////////////////////////////////////////////
520 vector<pcrData*> pDataArray;
521 DWORD dwThreadIdArray[processors-1];
522 HANDLE hThreadArray[processors-1];
524 string locationsFile = "locationsFile.txt";
525 m->mothurRemove(locationsFile);
526 m->mothurRemove(goodFileName);
527 m->mothurRemove(badFileName);
529 //Create processor worker threads.
530 for( int i=0; i<processors-1; i++ ){
532 string extension = "";
533 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
535 // Allocate memory for thread data.
536 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, locationsFile+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end);
537 pDataArray.push_back(tempPcr);
539 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
540 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
544 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, pend, adjustNeeded);
545 processIDS.push_back(processors-1);
547 //Wait until all threads have terminated.
548 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
550 //Close all thread handles and free memory allocations.
551 for(int i=0; i < pDataArray.size(); i++){
552 num += pDataArray[i]->count;
553 if (pDataArray[i]->count != pDataArray[i]->fend) {
554 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
556 if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; }
557 if (pDataArray[i]->pstart != -1) {
558 if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; }
559 if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; }
561 if (pDataArray[i]->pend != -1) {
562 if (pDataArray[i]->pend != pend) { adjustNeeded = true; }
563 if (pDataArray[i]->pend > pend) { pend = pDataArray[i]->pend; }
566 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
567 CloseHandle(hThreadArray[i]);
568 delete pDataArray[i];
571 for (int i = 0; i < processIDS.size(); i++) {
572 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
573 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
575 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
576 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
578 m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile);
579 m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp"));
583 if (fileAligned && adjustNeeded) { adjustDots(goodFileName, locationsFile, pstart, pend); }
588 catch(exception& e) {
589 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
594 //**********************************************************************************************************************
595 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set<string>& badSeqNames, linePair filePos, int& pstart, int& pend, bool& adjustNeeded){
598 m->openOutputFile(goodFasta, goodFile);
601 m->openOutputFile(badFasta, badFile);
603 ofstream locationsFile;
604 m->openOutputFile(locationsName, locationsFile);
607 m->openInputFile(filename, inFASTA);
609 inFASTA.seekg(filePos.start);
614 vector< set<int> > locations; //locations[0] = beginning locations, locations[1] = ending locations
617 //pdiffs, bdiffs, primers, barcodes, revPrimers
618 map<string, int> faked;
619 TrimOligos trim(pdiffs, 0, primers, faked, revPrimer);
623 if (m->control_pressed) { break; }
625 Sequence currSeq(inFASTA); m->gobble(inFASTA);
627 if (fileAligned) { //assume aligned until proven otherwise
628 lengths.insert(currSeq.getAligned().length());
629 if (lengths.size() > 1) { fileAligned = false; }
632 string trashCode = "";
633 string locationsString = "";
636 if (currSeq.getName() != "") {
638 if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); }
641 if (oligosfile != "") {
642 map<int, int> mapAligned;
643 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
646 if (primers.size() != 0) {
647 int primerStart = 0; int primerEnd = 0;
648 bool good = trim.findForward(currSeq, primerStart, primerEnd);
650 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
655 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned.
657 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1));
659 thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1);
660 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
665 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
667 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));
669 thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]);
670 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
674 isAligned(currSeq.getAligned(), mapAligned);
676 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
677 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
682 //process reverse primers
683 if (revPrimer.size() != 0) {
684 int primerStart = 0; int primerEnd = 0;
685 bool good = trim.findReverse(currSeq, primerStart, primerEnd);
686 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
691 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
693 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));
695 thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]);
696 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
701 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); }
703 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1));
705 thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1);
706 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
712 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
713 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
717 }else if (ecolifile != "") {
718 //make sure the seqs are aligned
719 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
720 else if (currSeq.getAligned().length() != length) {
721 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
724 currSeq.filterToPos(start);
725 currSeq.filterFromPos(end);
727 string seqString = currSeq.getAligned().substr(0, end);
728 seqString = seqString.substr(start);
729 currSeq.setAligned(seqString);
732 }else{ //using start and end to trim
733 //make sure the seqs are aligned
734 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
737 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
739 if (keepdots) { currSeq.filterFromPos(end); }
741 string seqString = currSeq.getAligned().substr(0, end);
742 currSeq.setAligned(seqString);
747 if (keepdots) { currSeq.filterToPos(start); }
749 string seqString = currSeq.getAligned().substr(start);
750 currSeq.setAligned(seqString);
756 //trimming removed all bases
757 if (currSeq.getUnaligned() == "") { goodSeq = false; }
760 currSeq.printSequence(goodFile);
761 if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); }
762 if (locationsString != "") { locationsFile << locationsString; }
763 if (thisPStart != -1) { locations[0].insert(thisPStart); }
764 if (thisPEnd != -1) { locations[1].insert(thisPEnd); }
767 badSeqNames.insert(currSeq.getName());
768 currSeq.setName(currSeq.getName() + '|' + trashCode);
769 currSeq.printSequence(badFile);
774 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
775 unsigned long long pos = inFASTA.tellg();
776 if ((pos == -1) || (pos >= filePos.end)) { break; }
778 if (inFASTA.eof()) { break; }
782 if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
785 if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
790 locationsFile.close();
792 if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); }
794 if (fileAligned && !keepdots) { //print out smallest start value and largest end value
795 if ((locations[0].size() > 1) || (locations[1].size() > 1)) { adjustNeeded = true; }
796 if (primers.size() != 0) { set<int>::iterator it = locations[0].begin(); pstart = *it; }
797 if (revPrimer.size() != 0) { set<int>::reverse_iterator it2 = locations[1].rbegin(); pend = *it2; }
802 catch(exception& e) {
803 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
807 //********************************************************************/
808 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
811 bool isAligned = false;
814 for (int i = 0; i < seq.length(); i++) {
815 if (!isalpha(seq[i])) { isAligned = true; }
816 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
817 } //ie. the 3rd base may be at spot 10 in the alignment
818 //later when we trim we want to trim from spot 10.
821 catch(exception& e) {
822 m->errorOut(e, "PcrSeqsCommand", "isAligned");
826 //**********************************************************************************************************************
827 int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){
830 m->openInputFile(goodFasta, inFasta);
832 ifstream inLocations;
833 m->openInputFile(locations, inLocations);
836 m->openOutputFile(goodFasta+".temp", out);
839 //cout << pstart << '\t' << pend << endl;
841 while(!inFasta.eof()) {
842 if(m->control_pressed) { break; }
844 Sequence seq(inFasta); m->gobble(inFasta);
847 int thisStart = -1; int thisEnd = -1;
848 if (primers.size() != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); }
849 if (revPrimer.size() != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); }
850 //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl;
851 //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl;
853 if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); }
856 if (thisStart != -1) {
857 if (thisStart != pstart) {
859 for (int i = pstart; i < thisStart; i++) { dots += "."; }
860 thisEnd += dots.length();
861 dots += seq.getAligned();
862 seq.setAligned(dots);
869 if (thisEnd != pend) {
870 string dots = seq.getAligned();
871 for (int i = thisEnd; i < pend; i++) { dots += "."; }
872 seq.setAligned(dots);
876 lengths.insert(seq.getAligned().length());
879 seq.printSequence(out);
884 m->mothurRemove(locations);
885 m->mothurRemove(goodFasta);
886 m->renameFile(goodFasta+".temp", goodFasta);
888 //cout << "final lengths = \n";
889 //for (set<int>::iterator it = lengths.begin(); it != lengths.end(); it++) {
890 // cout << *it << endl;
895 catch(exception& e) {
896 m->errorOut(e, "PcrSeqsCommand", "adjustDots");
900 //********************************************************************/
901 string PcrSeqsCommand::reverseOligo(string oligo){
905 for(int i=oligo.length()-1;i>=0;i--){
907 if(oligo[i] == 'A') { reverse += 'T'; }
908 else if(oligo[i] == 'T'){ reverse += 'A'; }
909 else if(oligo[i] == 'U'){ reverse += 'A'; }
911 else if(oligo[i] == 'G'){ reverse += 'C'; }
912 else if(oligo[i] == 'C'){ reverse += 'G'; }
914 else if(oligo[i] == 'R'){ reverse += 'Y'; }
915 else if(oligo[i] == 'Y'){ reverse += 'R'; }
917 else if(oligo[i] == 'M'){ reverse += 'K'; }
918 else if(oligo[i] == 'K'){ reverse += 'M'; }
920 else if(oligo[i] == 'W'){ reverse += 'W'; }
921 else if(oligo[i] == 'S'){ reverse += 'S'; }
923 else if(oligo[i] == 'B'){ reverse += 'V'; }
924 else if(oligo[i] == 'V'){ reverse += 'B'; }
926 else if(oligo[i] == 'D'){ reverse += 'H'; }
927 else if(oligo[i] == 'H'){ reverse += 'D'; }
929 else { reverse += 'N'; }
935 catch(exception& e) {
936 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
941 //***************************************************************************************************************
942 bool PcrSeqsCommand::readOligos(){
945 m->openInputFile(oligosfile, inOligos);
947 string type, oligo, group;
950 while(!inOligos.eof()){
954 if(type[0] == '#'){ //ignore
955 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
959 //make type case insensitive
960 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
964 for(int i=0;i<oligo.length();i++){
965 oligo[i] = toupper(oligo[i]);
966 if(oligo[i] == 'U') { oligo[i] = 'T'; }
969 if(type == "FORWARD"){
970 // get rest of line in case there is a primer name
971 while (!inOligos.eof()) {
972 char c = inOligos.get();
973 if (c == 10 || c == 13 || c == -1){ break; }
974 else if (c == 32 || c == 9){;} //space or tab
976 primers[oligo] = primerCount; primerCount++;
977 }else if(type == "REVERSE"){
978 string oligoRC = reverseOligo(oligo);
979 revPrimer.push_back(oligoRC);
980 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
981 }else if(type == "BARCODE"){
983 }else if((type == "LINKER")||(type == "SPACER")) {;}
984 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
990 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
991 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
992 m->control_pressed = true;
998 }catch(exception& e) {
999 m->errorOut(e, "PcrSeqsCommand", "readOligos");
1003 //***************************************************************************************************************
1004 bool PcrSeqsCommand::readEcoli(){
1007 m->openInputFile(ecolifile, in);
1012 length = ecoli.getAligned().length();
1013 start = ecoli.getStartPos();
1014 end = ecoli.getEndPos();
1015 }else { in.close(); m->control_pressed = true; return false; }
1020 catch(exception& e) {
1021 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
1026 //***************************************************************************************************************
1027 int PcrSeqsCommand::writeAccnos(set<string> badNames){
1029 string thisOutputDir = outputDir;
1030 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
1031 map<string, string> variables;
1032 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
1033 string outputFileName = getOutputFileName("accnos",variables);
1034 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
1037 m->openOutputFile(outputFileName, out);
1039 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
1040 if (m->control_pressed) { break; }
1041 out << (*it) << endl;
1047 catch(exception& e) {
1048 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
1053 //***************************************************************************************************************
1054 int PcrSeqsCommand::readName(set<string>& names){
1056 string thisOutputDir = outputDir;
1057 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
1058 map<string, string> variables;
1059 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
1060 variables["[extension]"] = m->getExtension(namefile);
1061 string outputFileName = getOutputFileName("name", variables);
1064 m->openOutputFile(outputFileName, out);
1067 m->openInputFile(namefile, in);
1068 string name, firstCol, secondCol;
1070 bool wroteSomething = false;
1071 int removedCount = 0;
1074 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1076 in >> firstCol; m->gobble(in);
1079 string savedSecond = secondCol;
1080 vector<string> parsedNames;
1081 m->splitAtComma(secondCol, parsedNames);
1083 vector<string> validSecond; validSecond.clear();
1084 for (int i = 0; i < parsedNames.size(); i++) {
1085 if (names.count(parsedNames[i]) == 0) {
1086 validSecond.push_back(parsedNames[i]);
1090 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1091 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1092 removedCount += parsedNames.size();
1094 out << firstCol << '\t' << savedSecond << endl;
1095 wroteSomething = true;
1102 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1103 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1105 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1109 catch(exception& e) {
1110 m->errorOut(e, "PcrSeqsCommand", "readName");
1114 //**********************************************************************************************************************
1115 int PcrSeqsCommand::readGroup(set<string> names){
1117 string thisOutputDir = outputDir;
1118 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1119 map<string, string> variables;
1120 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1121 variables["[extension]"] = m->getExtension(groupfile);
1122 string outputFileName = getOutputFileName("group", variables);
1125 m->openOutputFile(outputFileName, out);
1128 m->openInputFile(groupfile, in);
1131 bool wroteSomething = false;
1132 int removedCount = 0;
1135 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1137 in >> name; //read from first column
1138 in >> group; //read from second column
1140 //if this name is in the accnos file
1141 if (names.count(name) == 0) {
1142 wroteSomething = true;
1143 out << name << '\t' << group << endl;
1144 }else { removedCount++; }
1151 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1152 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1154 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1159 catch(exception& e) {
1160 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1164 //**********************************************************************************************************************
1165 int PcrSeqsCommand::readTax(set<string> names){
1167 string thisOutputDir = outputDir;
1168 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1169 map<string, string> variables;
1170 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1171 variables["[extension]"] = m->getExtension(taxfile);
1172 string outputFileName = getOutputFileName("taxonomy", variables);
1175 m->openOutputFile(outputFileName, out);
1178 m->openInputFile(taxfile, in);
1181 bool wroteSomething = false;
1182 int removedCount = 0;
1185 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1187 in >> name; //read from first column
1188 in >> tax; //read from second column
1190 //if this name is in the accnos file
1191 if (names.count(name) == 0) {
1192 wroteSomething = true;
1193 out << name << '\t' << tax << endl;
1194 }else { removedCount++; }
1201 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1202 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1204 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1208 catch(exception& e) {
1209 m->errorOut(e, "PcrSeqsCommand", "readTax");
1213 //***************************************************************************************************************
1214 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1217 m->openInputFile(countfile, in);
1218 set<string>::iterator it;
1220 map<string, string> variables;
1221 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1222 variables["[extension]"] = m->getExtension(countfile);
1223 string goodCountFile = getOutputFileName("count", variables);
1225 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1226 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1228 string headers = m->getline(in); m->gobble(in);
1229 goodCountOut << headers << endl;
1231 string name, rest; int thisTotal, removedCount; removedCount = 0;
1232 bool wroteSomething = false;
1235 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1237 in >> name; m->gobble(in);
1238 in >> thisTotal; m->gobble(in);
1239 rest = m->getline(in); m->gobble(in);
1241 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1243 wroteSomething = true;
1244 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1248 goodCountOut.close();
1250 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1252 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1254 //check for groups that have been eliminated
1256 if (ct.testGroups(goodCountFile)) {
1257 ct.readTable(goodCountFile, true);
1258 ct.printTable(goodCountFile);
1261 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1263 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1269 catch(exception& e) {
1270 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1274 /**************************************************************************************/