5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none","fasta",false,true,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false,true); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "NameCount", "none", "none","name",false,false,true); parameters.push_back(pname);
17 CommandParameter pcount("count", "InputTypes", "", "", "NameCount-CountGroup", "none", "none","count",false,false,true); parameters.push_back(pcount);
18 CommandParameter pgroup("group", "InputTypes", "", "", "CountGroup", "none", "none","group",false,false,true); parameters.push_back(pgroup);
19 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none","taxonomy",false,false,true); parameters.push_back(ptax);
20 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none","",false,false); parameters.push_back(pecoli);
21 CommandParameter pstart("start", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pstart);
22 CommandParameter pend("end", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(pend);
23 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "","",false,false); parameters.push_back(pnomatch);
24 CommandParameter ppdiffs("pdiffs", "Number", "", "0", "", "", "","",false,false,true); parameters.push_back(ppdiffs);
26 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
27 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "","",false,false); parameters.push_back(pkeepprimer);
28 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "","",false,false); parameters.push_back(pkeepdots);
29 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
30 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
32 vector<string> myArray;
33 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
37 m->errorOut(e, "PcrSeqsCommand", "setParameters");
41 //**********************************************************************************************************************
42 string PcrSeqsCommand::getHelpString(){
44 string helpString = "";
45 helpString += "The pcr.seqs command reads a fasta file.\n";
46 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, count, taxonomy, ecoli, start, end, nomatch, pdiffs, processors, keepprimer and keepdots.\n";
47 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
48 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
49 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
50 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
51 helpString += "The processors parameter allows you to use multiple processors.\n";
52 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
53 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
54 helpString += "The pdiffs parameter is used to specify the number of differences allowed in the primer. The default is 0.\n";
55 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
56 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
60 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
64 //**********************************************************************************************************************
65 string PcrSeqsCommand::getOutputPattern(string type) {
69 if (type == "fasta") { pattern = "[filename],pcr,[extension]-[filename],[tag],pcr,[extension]"; }
70 else if (type == "taxonomy") { pattern = "[filename],pcr,[extension]"; }
71 else if (type == "name") { pattern = "[filename],pcr,[extension]"; }
72 else if (type == "group") { pattern = "[filename],pcr,[extension]"; }
73 else if (type == "count") { pattern = "[filename],pcr,[extension]"; }
74 else if (type == "accnos") { pattern = "[filename],bad.accnos"; }
75 else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true; }
80 m->errorOut(e, "PcrSeqsCommand", "getOutputPattern");
84 //**********************************************************************************************************************
86 PcrSeqsCommand::PcrSeqsCommand(){
88 abort = true; calledHelp = true;
90 vector<string> tempOutNames;
91 outputTypes["fasta"] = tempOutNames;
92 outputTypes["taxonomy"] = tempOutNames;
93 outputTypes["group"] = tempOutNames;
94 outputTypes["name"] = tempOutNames;
95 outputTypes["count"] = tempOutNames;
96 outputTypes["accnos"] = tempOutNames;
99 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
103 //***************************************************************************************************************
105 PcrSeqsCommand::PcrSeqsCommand(string option) {
108 abort = false; calledHelp = false;
110 //allow user to run help
111 if(option == "help") { help(); abort = true; calledHelp = true; }
112 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string,string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string,string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["fasta"] = tempOutNames;
131 outputTypes["taxonomy"] = tempOutNames;
132 outputTypes["group"] = tempOutNames;
133 outputTypes["name"] = tempOutNames;
134 outputTypes["accnos"] = tempOutNames;
135 outputTypes["count"] = tempOutNames;
137 //if the user changes the input directory command factory will send this info to us in the output parameter
138 string inputDir = validParameter.validFile(parameters, "inputdir", false);
139 if (inputDir == "not found"){ inputDir = ""; }
142 it = parameters.find("fasta");
143 //user has given a template file
144 if(it != parameters.end()){
145 path = m->hasPath(it->second);
146 //if the user has not given a path then, add inputdir. else leave path alone.
147 if (path == "") { parameters["fasta"] = inputDir + it->second; }
150 it = parameters.find("oligos");
151 //user has given a template file
152 if(it != parameters.end()){
153 path = m->hasPath(it->second);
154 //if the user has not given a path then, add inputdir. else leave path alone.
155 if (path == "") { parameters["oligos"] = inputDir + it->second; }
158 it = parameters.find("ecoli");
159 //user has given a template file
160 if(it != parameters.end()){
161 path = m->hasPath(it->second);
162 //if the user has not given a path then, add inputdir. else leave path alone.
163 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
166 it = parameters.find("taxonomy");
167 //user has given a template file
168 if(it != parameters.end()){
169 path = m->hasPath(it->second);
170 //if the user has not given a path then, add inputdir. else leave path alone.
171 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
174 it = parameters.find("name");
175 //user has given a template file
176 if(it != parameters.end()){
177 path = m->hasPath(it->second);
178 //if the user has not given a path then, add inputdir. else leave path alone.
179 if (path == "") { parameters["name"] = inputDir + it->second; }
182 it = parameters.find("group");
183 //user has given a template file
184 if(it != parameters.end()){
185 path = m->hasPath(it->second);
186 //if the user has not given a path then, add inputdir. else leave path alone.
187 if (path == "") { parameters["group"] = inputDir + it->second; }
190 it = parameters.find("count");
191 //user has given a template file
192 if(it != parameters.end()){
193 path = m->hasPath(it->second);
194 //if the user has not given a path then, add inputdir. else leave path alone.
195 if (path == "") { parameters["count"] = inputDir + it->second; }
201 //check for required parameters
202 fastafile = validParameter.validFile(parameters, "fasta", true);
203 if (fastafile == "not found") {
204 fastafile = m->getFastaFile();
205 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
206 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
207 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
208 else { m->setFastaFile(fastafile); }
210 //if the user changes the output directory command factory will send this info to us in the output parameter
211 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
213 //check for optional parameter and set defaults
214 // ...at some point should added some additional type checking...
216 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
217 keepprimer = m->isTrue(temp);
219 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
220 keepdots = m->isTrue(temp);
222 temp = validParameter.validFile(parameters, "oligos", true);
223 if (temp == "not found"){ oligosfile = ""; }
224 else if(temp == "not open"){ oligosfile = ""; abort = true; }
225 else { oligosfile = temp; m->setOligosFile(oligosfile); }
227 ecolifile = validParameter.validFile(parameters, "ecoli", true);
228 if (ecolifile == "not found"){ ecolifile = ""; }
229 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
231 namefile = validParameter.validFile(parameters, "name", true);
232 if (namefile == "not found"){ namefile = ""; }
233 else if(namefile == "not open"){ namefile = ""; abort = true; }
234 else { m->setNameFile(namefile); }
236 groupfile = validParameter.validFile(parameters, "group", true);
237 if (groupfile == "not found"){ groupfile = ""; }
238 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
239 else { m->setGroupFile(groupfile); }
241 countfile = validParameter.validFile(parameters, "count", true);
242 if (countfile == "not open") { countfile = ""; abort = true; }
243 else if (countfile == "not found") { countfile = ""; }
244 else { m->setCountTableFile(countfile); }
246 if ((namefile != "") && (countfile != "")) {
247 m->mothurOut("[ERROR]: you may only use one of the following: name or count."); m->mothurOutEndLine(); abort = true;
250 if ((groupfile != "") && (countfile != "")) {
251 m->mothurOut("[ERROR]: you may only use one of the following: group or count."); m->mothurOutEndLine(); abort=true;
254 taxfile = validParameter.validFile(parameters, "taxonomy", true);
255 if (taxfile == "not found"){ taxfile = ""; }
256 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
257 else { m->setTaxonomyFile(taxfile); }
259 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
260 m->mothurConvert(temp, start);
262 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
263 m->mothurConvert(temp, end);
265 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
266 m->setProcessors(temp);
267 m->mothurConvert(temp, processors);
269 temp = validParameter.validFile(parameters, "pdiffs", false); if (temp == "not found") { temp = "0"; }
270 m->mothurConvert(temp, pdiffs);
272 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
274 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
277 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
278 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
281 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
283 if ((ecolifile != "") && (start != -1) && (end != -1)) {
284 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
288 if ((oligosfile != "") && (ecolifile != "")) {
289 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
292 //check to make sure you didn't forget the name file by mistake
293 if (countfile == "") {
294 if (namefile == "") {
295 vector<string> files; files.push_back(fastafile);
296 parser.getNameFile(files);
302 catch(exception& e) {
303 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
307 //***************************************************************************************************************
309 int PcrSeqsCommand::execute(){
312 if (abort == true) { if (calledHelp) { return 0; } return 2; }
314 int start = time(NULL);
315 fileAligned = true; pairedOligos = false;
317 string thisOutputDir = outputDir;
318 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
319 map<string, string> variables;
320 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
321 variables["[extension]"] = m->getExtension(fastafile);
322 string trimSeqFile = getOutputFileName("fasta",variables);
323 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
324 variables["[tag]"] = "scrap";
325 string badSeqFile = getOutputFileName("fasta",variables);
329 if(oligosfile != ""){ readOligos(); if (m->debug) { m->mothurOut("[DEBUG]: read oligos file. numprimers = " + toString(numFPrimers) + ", revprimers = " + toString(numRPrimers) + ".\n"); } } if (m->control_pressed) { return 0; }
330 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
332 vector<unsigned long long> positions;
333 int numFastaSeqs = 0;
334 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
335 positions = m->divideFile(fastafile, processors);
336 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
338 if (processors == 1) {
339 lines.push_back(linePair(0, 1000));
341 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
342 if (positions.size() < processors) { processors = positions.size(); }
344 //figure out how many sequences you have to process
345 int numSeqsPerProcessor = numFastaSeqs / processors;
346 for (int i = 0; i < processors; i++) {
347 int startIndex = i * numSeqsPerProcessor;
348 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
349 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
353 if (m->control_pressed) { return 0; }
355 set<string> badNames;
356 numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames);
358 if (m->control_pressed) { return 0; }
360 //don't write or keep if blank
361 if (badNames.size() != 0) { writeAccnos(badNames); }
362 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
363 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
365 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
366 if (namefile != "") { readName(badNames); }
367 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
368 if (groupfile != "") { readGroup(badNames); }
369 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
370 if (taxfile != "") { readTax(badNames); }
371 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
372 if (countfile != "") { readCount(badNames); }
373 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
375 m->mothurOutEndLine();
376 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
377 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
378 m->mothurOutEndLine();
379 m->mothurOutEndLine();
381 //set fasta file as new current fastafile
383 itTypes = outputTypes.find("fasta");
384 if (itTypes != outputTypes.end()) {
385 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
388 itTypes = outputTypes.find("name");
389 if (itTypes != outputTypes.end()) {
390 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
393 itTypes = outputTypes.find("group");
394 if (itTypes != outputTypes.end()) {
395 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
398 itTypes = outputTypes.find("accnos");
399 if (itTypes != outputTypes.end()) {
400 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
403 itTypes = outputTypes.find("taxonomy");
404 if (itTypes != outputTypes.end()) {
405 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
408 itTypes = outputTypes.find("count");
409 if (itTypes != outputTypes.end()) {
410 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setCountTableFile(current); }
413 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
414 m->mothurOutEndLine();
420 catch(exception& e) {
421 m->errorOut(e, "PcrSeqsCommand", "execute");
425 /**************************************************************************************************/
426 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
429 vector<int> processIDS;
432 int pstart = -1; int pend = -1;
433 bool adjustNeeded = false;
435 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
437 //loop through and create all the processes you want
438 while (process != processors) {
442 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
445 string locationsFile = m->mothurGetpid(process) + ".temp";
446 num = driverPcr(filename, goodFileName + m->mothurGetpid(process) + ".temp", badFileName + m->mothurGetpid(process) + ".temp", locationsFile, badSeqNames, lines[process], pstart, adjustNeeded);
448 //pass numSeqs to parent
450 string tempFile = filename + m->mothurGetpid(process) + ".num.temp";
451 m->openOutputFile(tempFile, out);
452 out << pstart << '\t' << adjustNeeded << endl;
453 out << num << '\t' << badSeqNames.size() << endl;
454 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
455 out << (*it) << endl;
461 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
462 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
467 string locationsFile = m->mothurGetpid(process) + ".temp";
468 num = driverPcr(filename, goodFileName, badFileName, locationsFile, badSeqNames, lines[0], pstart, adjustNeeded);
470 //force parent to wait until all the processes are done
471 for (int i=0;i<processIDS.size();i++) {
472 int temp = processIDS[i];
476 for (int i = 0; i < processIDS.size(); i++) {
478 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
479 m->openInputFile(tempFile, in);
480 int numBadNames = 0; string name = "";
481 int tpstart = -1; bool tempAdjust = false;
484 in >> tpstart >> tempAdjust; m->gobble(in);
486 if (tempAdjust) { adjustNeeded = true; }
488 if (tpstart != pstart) { adjustNeeded = true; }
489 if (tpstart < pstart) { pstart = tpstart; } //smallest start
491 int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in);
493 for (int j = 0; j < numBadNames; j++) {
494 in >> name; m->gobble(in);
495 badSeqNames.insert(name);
497 in.close(); m->mothurRemove(tempFile);
499 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
500 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
502 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
503 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
505 m->appendFiles((toString(processIDS[i]) + ".temp"), locationsFile);
506 m->mothurRemove((toString(processIDS[i]) + ".temp"));
510 //////////////////////////////////////////////////////////////////////////////////////////////////////
511 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
512 //Above fork() will clone, so memory is separate, but that's not the case with windows,
513 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
514 //////////////////////////////////////////////////////////////////////////////////////////////////////
516 vector<pcrData*> pDataArray;
517 DWORD dwThreadIdArray[processors-1];
518 HANDLE hThreadArray[processors-1];
520 string locationsFile = "locationsFile.txt";
521 m->mothurRemove(locationsFile);
522 m->mothurRemove(goodFileName);
523 m->mothurRemove(badFileName);
525 //Create processor worker threads.
526 for( int i=0; i<processors-1; i++ ){
528 string extension = "";
529 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
531 // Allocate memory for thread data.
532 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, locationsFile+extension, m, oligosfile, ecolifile, nomatch, keepprimer, keepdots, start, end, length, pdiffs, lines[i].start, lines[i].end);
533 pDataArray.push_back(tempPcr);
535 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
536 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
540 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"), (locationsFile+toString(processors-1)+".temp"), badSeqNames, lines[processors-1], pstart, adjustNeeded);
541 processIDS.push_back(processors-1);
543 //Wait until all threads have terminated.
544 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
546 //Close all thread handles and free memory allocations.
547 for(int i=0; i < pDataArray.size(); i++){
548 num += pDataArray[i]->count;
549 if (pDataArray[i]->count != pDataArray[i]->fend) {
550 m->mothurOut("[ERROR]: process " + toString(i) + " only processed " + toString(pDataArray[i]->count) + " of " + toString(pDataArray[i]->fend) + " sequences assigned to it, quitting. \n"); m->control_pressed = true;
552 if (pDataArray[i]->adjustNeeded) { adjustNeeded = true; }
553 if (pDataArray[i]->pstart != -1) {
554 if (pDataArray[i]->pstart != pstart) { adjustNeeded = true; }
555 if (pDataArray[i]->pstart < pstart) { pstart = pDataArray[i]->pstart; }
558 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
559 CloseHandle(hThreadArray[i]);
560 delete pDataArray[i];
563 for (int i = 0; i < processIDS.size(); i++) {
564 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
565 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
567 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
568 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
570 m->appendFiles((locationsFile+toString(processIDS[i]) + ".temp"), locationsFile);
571 m->mothurRemove((locationsFile+toString(processIDS[i]) + ".temp"));
578 if (fileAligned && adjustNeeded) {
579 //find pend - pend is the biggest ending value, but we must account for when we adjust the start. That adjustment may make the "new" end larger then the largest end. So lets find out what that "new" end will be.
580 ifstream inLocations;
581 m->openInputFile(locationsFile, inLocations);
583 while(!inLocations.eof()) {
585 if (m->control_pressed) { break; }
588 int thisStart = -1; int thisEnd = -1;
589 if (numFPrimers != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); }
590 if (numRPrimers != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); }
591 else { pend = -1; break; }
595 if (thisStart != -1) {
596 if (thisStart != pstart) { myDiff += (thisStart - pstart); }
600 int myEnd = thisEnd + myDiff;
601 //cout << name << '\t' << thisStart << '\t' << thisEnd << " diff = " << myDiff << '\t' << myEnd << endl;
604 if (myEnd > pend) { pend = myEnd; }
610 adjustDots(goodFileName, locationsFile, pstart, pend);
611 }else { m->mothurRemove(locationsFile); }
616 catch(exception& e) {
617 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
622 //**********************************************************************************************************************
623 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, string locationsName, set<string>& badSeqNames, linePair filePos, int& pstart, bool& adjustNeeded){
626 m->openOutputFile(goodFasta, goodFile);
629 m->openOutputFile(badFasta, badFile);
631 ofstream locationsFile;
632 m->openOutputFile(locationsName, locationsFile);
635 m->openInputFile(filename, inFASTA);
637 inFASTA.seekg(filePos.start);
642 set<int> locations; //locations[0] = beginning locations,
644 //pdiffs, bdiffs, primers, barcodes, revPrimers
645 map<string, int> primers;
646 map<string, int> barcodes; //not used
647 vector<string> revPrimer;
649 map<int, oligosPair> primerPairs = oligos.getPairedPrimers();
650 for (map<int, oligosPair>::iterator it = primerPairs.begin(); it != primerPairs.end(); it++) {
651 primers[(it->second).forward] = it->first;
652 revPrimer.push_back((it->second).reverse);
654 if (pdiffs != 0) { m->mothurOut("[WARNING]: Pcr.seqs is only designed to allow diffs in forward primers. Reverse primers must be an exact match.\n"); }
656 primers = oligos.getPrimers();
657 revPrimer = oligos.getReversePrimers();
660 TrimOligos trim(pdiffs, 0, primers, barcodes, revPrimer);
664 if (m->control_pressed) { break; }
666 Sequence currSeq(inFASTA); m->gobble(inFASTA);
668 if (fileAligned) { //assume aligned until proven otherwise
669 lengths.insert(currSeq.getAligned().length());
670 if (lengths.size() > 1) { fileAligned = false; }
673 string trashCode = "";
674 string locationsString = "";
677 if (currSeq.getName() != "") {
679 if (m->debug) { m->mothurOut("[DEBUG]: seq name = " + currSeq.getName() + ".\n"); }
682 if (oligosfile != "") {
683 map<int, int> mapAligned;
684 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
687 if (primers.size() != 0) {
688 int primerStart = 0; int primerEnd = 0;
689 bool good = trim.findForward(currSeq, primerStart, primerEnd);
691 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
696 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd-1]+1); } //mapAligned[primerEnd-1] is the location of the last base in the primer. we want to trim to the space just after that. The -1 & +1 ensures if the primer is followed by gaps they are not trimmed causing an aligned sequence dataset to become unaligned.
698 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd-1]+1));
700 thisPStart = mapAligned[primerEnd-1]+1; //locations[0].insert(mapAligned[primerEnd-1]+1);
701 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
706 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
708 currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart]));
710 thisPStart = mapAligned[primerStart]; //locations[0].insert(mapAligned[primerStart]);
711 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
715 isAligned(currSeq.getAligned(), mapAligned);
717 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
718 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
723 //process reverse primers
724 if (revPrimer.size() != 0) {
725 int primerStart = 0; int primerEnd = 0;
726 bool good = trim.findReverse(currSeq, primerStart, primerEnd);
727 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
732 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
734 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart]));
736 thisPEnd = mapAligned[primerStart]; //locations[1].insert(mapAligned[primerStart]);
737 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerStart]) + "\n";
742 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd-1]+1); }
744 currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd-1]+1));
746 thisPEnd = mapAligned[primerEnd-1]+1; //locations[1].insert(mapAligned[primerEnd-1]+1);
747 locationsString += currSeq.getName() + "\t" + toString(mapAligned[primerEnd-1]+1) + "\n";
753 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
754 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
758 }else if (ecolifile != "") {
759 //make sure the seqs are aligned
760 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
761 else if (currSeq.getAligned().length() != length) {
762 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
765 currSeq.filterToPos(start);
766 currSeq.filterFromPos(end);
768 string seqString = currSeq.getAligned().substr(0, end);
769 seqString = seqString.substr(start);
770 currSeq.setAligned(seqString);
773 }else{ //using start and end to trim
774 //make sure the seqs are aligned
775 if (!fileAligned) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
778 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
780 if (keepdots) { currSeq.filterFromPos(end); }
782 string seqString = currSeq.getAligned().substr(0, end);
783 currSeq.setAligned(seqString);
788 if (keepdots) { currSeq.filterToPos(start); }
790 string seqString = currSeq.getAligned().substr(start);
791 currSeq.setAligned(seqString);
797 //trimming removed all bases
798 if (currSeq.getUnaligned() == "") { goodSeq = false; }
801 currSeq.printSequence(goodFile);
802 if (m->debug) { m->mothurOut("[DEBUG]: " + locationsString + "\n"); }
803 if (thisPStart != -1) { locations.insert(thisPStart); }
804 if (locationsString != "") { locationsFile << locationsString; }
807 badSeqNames.insert(currSeq.getName());
808 currSeq.setName(currSeq.getName() + '|' + trashCode);
809 currSeq.printSequence(badFile);
814 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
815 unsigned long long pos = inFASTA.tellg();
816 if ((pos == -1) || (pos >= filePos.end)) { break; }
818 if (inFASTA.eof()) { break; }
822 if((count) % 100 == 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
825 if((count) % 100 != 0){ m->mothurOutJustToScreen("Processing sequence: " + toString(count)+"\n"); }
830 locationsFile.close();
832 if (m->debug) { m->mothurOut("[DEBUG]: fileAligned = " + toString(fileAligned) +'\n'); }
834 if (fileAligned && !keepdots) { //print out smallest start value and largest end value
835 if (locations.size() > 1) { adjustNeeded = true; }
836 if (primers.size() != 0) { set<int>::iterator it = locations.begin(); pstart = *it; }
841 catch(exception& e) {
842 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
846 //********************************************************************/
847 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
850 bool isAligned = false;
853 for (int i = 0; i < seq.length(); i++) {
854 if (!isalpha(seq[i])) { isAligned = true; }
855 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
856 } //ie. the 3rd base may be at spot 10 in the alignment
857 //later when we trim we want to trim from spot 10.
860 catch(exception& e) {
861 m->errorOut(e, "PcrSeqsCommand", "isAligned");
865 //**********************************************************************************************************************
866 int PcrSeqsCommand::adjustDots(string goodFasta, string locations, int pstart, int pend){
869 m->openInputFile(goodFasta, inFasta);
871 ifstream inLocations;
872 m->openInputFile(locations, inLocations);
875 m->openOutputFile(goodFasta+".temp", out);
878 //cout << pstart << '\t' << pend << endl;
879 //if (pstart > pend) { //swap them
881 while(!inFasta.eof()) {
882 if(m->control_pressed) { break; }
884 Sequence seq(inFasta); m->gobble(inFasta);
887 int thisStart = -1; int thisEnd = -1;
888 if (numFPrimers != 0) { inLocations >> name >> thisStart; m->gobble(inLocations); }
889 if (numRPrimers != 0) { inLocations >> name >> thisEnd; m->gobble(inLocations); }
892 //cout << seq.getName() << '\t' << thisStart << '\t' << thisEnd << '\t' << seq.getAligned().length() << endl;
893 //cout << seq.getName() << '\t' << pstart << '\t' << pend << endl;
895 if (name != seq.getName()) { m->mothurOut("[ERROR]: name mismatch in pcr.seqs.\n"); }
898 if (thisStart != -1) {
899 if (thisStart != pstart) {
901 for (int i = pstart; i < thisStart; i++) { dots += "."; }
902 thisEnd += dots.length();
903 dots += seq.getAligned();
904 seq.setAligned(dots);
911 if (thisEnd != pend) {
912 string dots = seq.getAligned();
913 for (int i = thisEnd; i < pend; i++) { dots += "."; }
914 seq.setAligned(dots);
918 lengths.insert(seq.getAligned().length());
921 seq.printSequence(out);
926 m->mothurRemove(locations);
927 m->mothurRemove(goodFasta);
928 m->renameFile(goodFasta+".temp", goodFasta);
930 //cout << "final lengths = \n";
931 //for (set<int>::iterator it = lengths.begin(); it != lengths.end(); it++) {
932 //cout << *it << endl;
933 // cout << lengths.count(*it) << endl;
938 catch(exception& e) {
939 m->errorOut(e, "PcrSeqsCommand", "adjustDots");
943 //***************************************************************************************************************
944 bool PcrSeqsCommand::readEcoli(){
947 m->openInputFile(ecolifile, in);
952 length = ecoli.getAligned().length();
953 start = ecoli.getStartPos();
954 end = ecoli.getEndPos();
955 }else { in.close(); m->control_pressed = true; return false; }
960 catch(exception& e) {
961 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
966 //***************************************************************************************************************
967 int PcrSeqsCommand::writeAccnos(set<string> badNames){
969 string thisOutputDir = outputDir;
970 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
971 map<string, string> variables;
972 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(fastafile));
973 string outputFileName = getOutputFileName("accnos",variables);
974 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
977 m->openOutputFile(outputFileName, out);
979 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
980 if (m->control_pressed) { break; }
981 out << (*it) << endl;
987 catch(exception& e) {
988 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
993 //***************************************************************************************************************
994 int PcrSeqsCommand::readName(set<string>& names){
996 string thisOutputDir = outputDir;
997 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
998 map<string, string> variables;
999 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(namefile));
1000 variables["[extension]"] = m->getExtension(namefile);
1001 string outputFileName = getOutputFileName("name", variables);
1004 m->openOutputFile(outputFileName, out);
1007 m->openInputFile(namefile, in);
1008 string name, firstCol, secondCol;
1010 bool wroteSomething = false;
1011 int removedCount = 0;
1014 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1016 in >> firstCol; m->gobble(in);
1019 string savedSecond = secondCol;
1020 vector<string> parsedNames;
1021 m->splitAtComma(secondCol, parsedNames);
1023 vector<string> validSecond; validSecond.clear();
1024 for (int i = 0; i < parsedNames.size(); i++) {
1025 if (names.count(parsedNames[i]) == 0) {
1026 validSecond.push_back(parsedNames[i]);
1030 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
1031 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
1032 removedCount += parsedNames.size();
1034 out << firstCol << '\t' << savedSecond << endl;
1035 wroteSomething = true;
1042 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1043 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
1045 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
1049 catch(exception& e) {
1050 m->errorOut(e, "PcrSeqsCommand", "readName");
1054 //**********************************************************************************************************************
1055 int PcrSeqsCommand::readGroup(set<string> names){
1057 string thisOutputDir = outputDir;
1058 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1059 map<string, string> variables;
1060 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(groupfile));
1061 variables["[extension]"] = m->getExtension(groupfile);
1062 string outputFileName = getOutputFileName("group", variables);
1065 m->openOutputFile(outputFileName, out);
1068 m->openInputFile(groupfile, in);
1071 bool wroteSomething = false;
1072 int removedCount = 0;
1075 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1077 in >> name; //read from first column
1078 in >> group; //read from second column
1080 //if this name is in the accnos file
1081 if (names.count(name) == 0) {
1082 wroteSomething = true;
1083 out << name << '\t' << group << endl;
1084 }else { removedCount++; }
1091 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1092 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1094 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1099 catch(exception& e) {
1100 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1104 //**********************************************************************************************************************
1105 int PcrSeqsCommand::readTax(set<string> names){
1107 string thisOutputDir = outputDir;
1108 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1109 map<string, string> variables;
1110 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(taxfile));
1111 variables["[extension]"] = m->getExtension(taxfile);
1112 string outputFileName = getOutputFileName("taxonomy", variables);
1115 m->openOutputFile(outputFileName, out);
1118 m->openInputFile(taxfile, in);
1121 bool wroteSomething = false;
1122 int removedCount = 0;
1125 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1127 in >> name; //read from first column
1128 in >> tax; //read from second column
1130 //if this name is in the accnos file
1131 if (names.count(name) == 0) {
1132 wroteSomething = true;
1133 out << name << '\t' << tax << endl;
1134 }else { removedCount++; }
1141 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1142 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1144 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1148 catch(exception& e) {
1149 m->errorOut(e, "PcrSeqsCommand", "readTax");
1153 //***************************************************************************************************************
1154 int PcrSeqsCommand::readCount(set<string> badSeqNames){
1157 m->openInputFile(countfile, in);
1158 set<string>::iterator it;
1160 map<string, string> variables;
1161 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(countfile));
1162 variables["[extension]"] = m->getExtension(countfile);
1163 string goodCountFile = getOutputFileName("count", variables);
1165 outputNames.push_back(goodCountFile); outputTypes["count"].push_back(goodCountFile);
1166 ofstream goodCountOut; m->openOutputFile(goodCountFile, goodCountOut);
1168 string headers = m->getline(in); m->gobble(in);
1169 goodCountOut << headers << endl;
1171 string name, rest; int thisTotal, removedCount; removedCount = 0;
1172 bool wroteSomething = false;
1175 if (m->control_pressed) { goodCountOut.close(); in.close(); m->mothurRemove(goodCountFile); return 0; }
1177 in >> name; m->gobble(in);
1178 in >> thisTotal; m->gobble(in);
1179 rest = m->getline(in); m->gobble(in);
1181 if (badSeqNames.count(name) != 0) { removedCount+=thisTotal; }
1183 wroteSomething = true;
1184 goodCountOut << name << '\t' << thisTotal << '\t' << rest << endl;
1188 goodCountOut.close();
1190 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1192 if (wroteSomething == false) { m->mothurOut("Your count file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1194 //check for groups that have been eliminated
1196 if (ct.testGroups(goodCountFile)) {
1197 ct.readTable(goodCountFile, true, false);
1198 ct.printTable(goodCountFile);
1201 if (m->control_pressed) { m->mothurRemove(goodCountFile); }
1203 m->mothurOut("Removed " + toString(removedCount) + " sequences from your count file."); m->mothurOutEndLine();
1209 catch(exception& e) {
1210 m->errorOut(e, "PcrSeqsCommand", "readCOunt");
1214 //***************************************************************************************************************
1216 int PcrSeqsCommand::readOligos(){
1218 oligos.read(oligosfile);
1220 if (m->control_pressed) { return false; } //error in reading oligos
1222 if (oligos.hasPairedBarcodes()) {
1223 pairedOligos = true;
1224 numFPrimers = oligos.getPairedPrimers().size();
1226 pairedOligos = false;
1227 numFPrimers = oligos.getPrimers().size();
1229 numRPrimers = oligos.getReversePrimers().size();
1231 if (oligos.getLinkers().size() != 0) { m->mothurOut("[WARNING]: pcr.seqs is not setup to remove linkers, ignoring.\n"); }
1232 if (oligos.getSpacers().size() != 0) { m->mothurOut("[WARNING]: pcr.seqs is not setup to remove spacers, ignoring.\n"); }
1237 catch(exception& e) {
1238 m->errorOut(e, "PcrSeqsCommand", "readOligos");
1243 /**************************************************************************************/