5 // Created by Sarah Westcott on 3/14/12.
6 // Copyright (c) 2012 Schloss Lab. All rights reserved.
9 #include "pcrseqscommand.h"
11 //**********************************************************************************************************************
12 vector<string> PcrSeqsCommand::setParameters(){
14 CommandParameter pfasta("fasta", "InputTypes", "", "", "none", "none", "none",false,true); parameters.push_back(pfasta);
15 CommandParameter poligos("oligos", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(poligos);
16 CommandParameter pname("name", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pname);
17 CommandParameter pgroup("group", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(pgroup);
18 CommandParameter ptax("taxonomy", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(ptax);
19 CommandParameter pecoli("ecoli", "InputTypes", "", "", "ecolioligos", "none", "none",false,false); parameters.push_back(pecoli);
20 CommandParameter pstart("start", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pstart);
21 CommandParameter pend("end", "Number", "", "-1", "", "", "",false,false); parameters.push_back(pend);
22 CommandParameter pnomatch("nomatch", "Multiple", "reject-keep", "reject", "", "", "",false,false); parameters.push_back(pnomatch);
23 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
24 CommandParameter pkeepprimer("keepprimer", "Boolean", "", "F", "", "", "",false,false); parameters.push_back(pkeepprimer);
25 CommandParameter pkeepdots("keepdots", "Boolean", "", "T", "", "", "",false,false); parameters.push_back(pkeepdots);
26 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
27 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
29 vector<string> myArray;
30 for (int i = 0; i < parameters.size(); i++) { myArray.push_back(parameters[i].name); }
34 m->errorOut(e, "PcrSeqsCommand", "setParameters");
38 //**********************************************************************************************************************
39 string PcrSeqsCommand::getHelpString(){
41 string helpString = "";
42 helpString += "The pcr.seqs command reads a fasta file.\n";
43 helpString += "The pcr.seqs command parameters are fasta, oligos, name, group, taxonomy, ecoli, start, end, nomatch, processors, keepprimer and keepdots.\n";
44 helpString += "The ecoli parameter is used to provide a fasta file containing a single reference sequence (e.g. for e. coli) this must be aligned. Mothur will trim to the start and end positions of the reference sequence.\n";
45 helpString += "The start parameter allows you to provide a starting position to trim to.\n";
46 helpString += "The end parameter allows you to provide a ending position to trim from.\n";
47 helpString += "The nomatch parameter allows you to decide what to do with sequences where the primer is not found. Default=reject, meaning remove from fasta file. if nomatch=true, then do nothing to sequence.\n";
48 helpString += "The processors parameter allows you to use multiple processors.\n";
49 helpString += "The keepprimer parameter allows you to keep the primer, default=false.\n";
50 helpString += "The keepdots parameter allows you to keep the leading and trailing .'s, default=true.\n";
51 helpString += "Note: No spaces between parameter labels (i.e. fasta), '=' and parameters (i.e.yourFasta).\n";
52 helpString += "For more details please check out the wiki http://www.mothur.org/wiki/Pcr.seqs .\n";
56 m->errorOut(e, "PcrSeqsCommand", "getHelpString");
61 //**********************************************************************************************************************
62 string PcrSeqsCommand::getOutputFileNameTag(string type, string inputName=""){
64 string outputFileName = "";
65 map<string, vector<string> >::iterator it;
67 //is this a type this command creates
68 it = outputTypes.find(type);
69 if (it == outputTypes.end()) { m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
71 if (type == "fasta") { outputFileName = "pcr.fasta"; }
72 else if (type == "taxonomy") { outputFileName = "pcr" + m->getExtension(inputName); }
73 else if (type == "group") { outputFileName = "pcr" + m->getExtension(inputName); }
74 else if (type == "name") { outputFileName = "pcr" + m->getExtension(inputName); }
75 else if (type == "accnos") { outputFileName = "bad.accnos"; }
76 else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true; }
78 return outputFileName;
81 m->errorOut(e, "PcrSeqsCommand", "getOutputFileNameTag");
85 //**********************************************************************************************************************
87 PcrSeqsCommand::PcrSeqsCommand(){
89 abort = true; calledHelp = true;
91 vector<string> tempOutNames;
92 outputTypes["fasta"] = tempOutNames;
93 outputTypes["taxonomy"] = tempOutNames;
94 outputTypes["group"] = tempOutNames;
95 outputTypes["name"] = tempOutNames;
96 outputTypes["accnos"] = tempOutNames;
99 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
103 //***************************************************************************************************************
105 PcrSeqsCommand::PcrSeqsCommand(string option) {
108 abort = false; calledHelp = false;
110 //allow user to run help
111 if(option == "help") { help(); abort = true; calledHelp = true; }
112 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
115 vector<string> myArray = setParameters();
117 OptionParser parser(option);
118 map<string,string> parameters = parser.getParameters();
120 ValidParameters validParameter;
121 map<string,string>::iterator it;
123 //check to make sure all parameters are valid for command
124 for (it = parameters.begin(); it != parameters.end(); it++) {
125 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) { abort = true; }
128 //initialize outputTypes
129 vector<string> tempOutNames;
130 outputTypes["fasta"] = tempOutNames;
131 outputTypes["taxonomy"] = tempOutNames;
132 outputTypes["group"] = tempOutNames;
133 outputTypes["name"] = tempOutNames;
134 outputTypes["accnos"] = tempOutNames;
136 //if the user changes the input directory command factory will send this info to us in the output parameter
137 string inputDir = validParameter.validFile(parameters, "inputdir", false);
138 if (inputDir == "not found"){ inputDir = ""; }
141 it = parameters.find("fasta");
142 //user has given a template file
143 if(it != parameters.end()){
144 path = m->hasPath(it->second);
145 //if the user has not given a path then, add inputdir. else leave path alone.
146 if (path == "") { parameters["fasta"] = inputDir + it->second; }
149 it = parameters.find("oligos");
150 //user has given a template file
151 if(it != parameters.end()){
152 path = m->hasPath(it->second);
153 //if the user has not given a path then, add inputdir. else leave path alone.
154 if (path == "") { parameters["oligos"] = inputDir + it->second; }
157 it = parameters.find("ecoli");
158 //user has given a template file
159 if(it != parameters.end()){
160 path = m->hasPath(it->second);
161 //if the user has not given a path then, add inputdir. else leave path alone.
162 if (path == "") { parameters["ecoli"] = inputDir + it->second; }
165 it = parameters.find("taxonomy");
166 //user has given a template file
167 if(it != parameters.end()){
168 path = m->hasPath(it->second);
169 //if the user has not given a path then, add inputdir. else leave path alone.
170 if (path == "") { parameters["taxonomy"] = inputDir + it->second; }
173 it = parameters.find("name");
174 //user has given a template file
175 if(it != parameters.end()){
176 path = m->hasPath(it->second);
177 //if the user has not given a path then, add inputdir. else leave path alone.
178 if (path == "") { parameters["name"] = inputDir + it->second; }
181 it = parameters.find("group");
182 //user has given a template file
183 if(it != parameters.end()){
184 path = m->hasPath(it->second);
185 //if the user has not given a path then, add inputdir. else leave path alone.
186 if (path == "") { parameters["group"] = inputDir + it->second; }
192 //check for required parameters
193 fastafile = validParameter.validFile(parameters, "fasta", true);
194 if (fastafile == "not found") {
195 fastafile = m->getFastaFile();
196 if (fastafile != "") { m->mothurOut("Using " + fastafile + " as input file for the fasta parameter."); m->mothurOutEndLine(); }
197 else { m->mothurOut("You have no current fastafile and the fasta parameter is required."); m->mothurOutEndLine(); abort = true; }
198 }else if (fastafile == "not open") { fastafile = ""; abort = true; }
199 else { m->setFastaFile(fastafile); }
201 //if the user changes the output directory command factory will send this info to us in the output parameter
202 outputDir = validParameter.validFile(parameters, "outputdir", false); if (outputDir == "not found"){ outputDir = m->hasPath(fastafile); }
204 //check for optional parameter and set defaults
205 // ...at some point should added some additional type checking...
207 temp = validParameter.validFile(parameters, "keepprimer", false); if (temp == "not found") { temp = "f"; }
208 keepprimer = m->isTrue(temp);
210 temp = validParameter.validFile(parameters, "keepdots", false); if (temp == "not found") { temp = "t"; }
211 keepdots = m->isTrue(temp);
213 temp = validParameter.validFile(parameters, "oligos", true);
214 if (temp == "not found"){ oligosfile = ""; }
215 else if(temp == "not open"){ oligosfile = ""; abort = true; }
216 else { oligosfile = temp; m->setOligosFile(oligosfile); }
218 ecolifile = validParameter.validFile(parameters, "ecoli", true);
219 if (ecolifile == "not found"){ ecolifile = ""; }
220 else if(ecolifile == "not open"){ ecolifile = ""; abort = true; }
222 namefile = validParameter.validFile(parameters, "name", true);
223 if (namefile == "not found"){ namefile = ""; }
224 else if(namefile == "not open"){ namefile = ""; abort = true; }
225 else { m->setNameFile(namefile); }
227 groupfile = validParameter.validFile(parameters, "group", true);
228 if (groupfile == "not found"){ groupfile = ""; }
229 else if(groupfile == "not open"){ groupfile = ""; abort = true; }
230 else { m->setGroupFile(groupfile); }
232 taxfile = validParameter.validFile(parameters, "taxonomy", true);
233 if (taxfile == "not found"){ taxfile = ""; }
234 else if(taxfile == "not open"){ taxfile = ""; abort = true; }
235 else { m->setTaxonomyFile(taxfile); }
237 temp = validParameter.validFile(parameters, "start", false); if (temp == "not found") { temp = "-1"; }
238 m->mothurConvert(temp, start);
240 temp = validParameter.validFile(parameters, "end", false); if (temp == "not found") { temp = "-1"; }
241 m->mothurConvert(temp, end);
243 temp = validParameter.validFile(parameters, "processors", false); if (temp == "not found"){ temp = m->getProcessors(); }
244 m->setProcessors(temp);
245 m->mothurConvert(temp, processors);
247 nomatch = validParameter.validFile(parameters, "nomatch", false); if (nomatch == "not found") { nomatch = "reject"; }
249 if ((nomatch != "reject") && (nomatch != "keep")) { m->mothurOut("[ERROR]: " + nomatch + " is not a valid entry for nomatch. Choices are reject and keep.\n"); abort = true; }
252 if ((oligosfile == "") && (ecolifile == "") && (start == -1) && (end == -1)) {
253 m->mothurOut("[ERROR]: You did not set any options. Please provide an oligos or ecoli file, or set start or end.\n"); abort = true;
256 if ((oligosfile == "") && (ecolifile == "") && (start < 0) && (end == -1)) { m->mothurOut("[ERROR]: Invalid start value.\n"); abort = true; }
258 if ((ecolifile != "") && (start != -1) && (end != -1)) {
259 m->mothurOut("[ERROR]: You provided an ecoli file , but set the start or end parameters. Unsure what you intend. When you provide the ecoli file, mothur thinks you want to use the start and end of the sequence in the ecoli file.\n"); abort = true;
263 if ((oligosfile != "") && (ecolifile != "")) {
264 m->mothurOut("[ERROR]: You can not use an ecoli file at the same time as an oligos file.\n"); abort = true;
267 //check to make sure you didn't forget the name file by mistake
268 if (namefile == "") {
269 vector<string> files; files.push_back(fastafile);
270 parser.getNameFile(files);
275 catch(exception& e) {
276 m->errorOut(e, "PcrSeqsCommand", "PcrSeqsCommand");
280 //***************************************************************************************************************
282 int PcrSeqsCommand::execute(){
285 if (abort == true) { if (calledHelp) { return 0; } return 2; }
287 int start = time(NULL);
289 string thisOutputDir = outputDir;
290 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
291 string trimSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("fasta");
292 outputNames.push_back(trimSeqFile); outputTypes["fasta"].push_back(trimSeqFile);
294 string badSeqFile = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + "scrap." + getOutputFileNameTag("fasta");
298 if(oligosfile != ""){ readOligos(); } if (m->control_pressed) { return 0; }
299 if(ecolifile != "") { readEcoli(); } if (m->control_pressed) { return 0; }
301 vector<unsigned long long> positions;
302 int numFastaSeqs = 0;
303 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
304 positions = m->divideFile(fastafile, processors);
305 for (int i = 0; i < (positions.size()-1); i++) { lines.push_back(linePair(positions[i], positions[(i+1)])); }
307 if (processors == 1) {
308 lines.push_back(linePair(0, 1000));
310 positions = m->setFilePosFasta(fastafile, numFastaSeqs);
311 if (positions.size() < processors) { processors = positions.size(); }
313 //figure out how many sequences you have to process
314 int numSeqsPerProcessor = numFastaSeqs / processors;
315 for (int i = 0; i < processors; i++) {
316 int startIndex = i * numSeqsPerProcessor;
317 if(i == (processors - 1)){ numSeqsPerProcessor = numFastaSeqs - i * numSeqsPerProcessor; }
318 lines.push_back(linePair(positions[startIndex], numSeqsPerProcessor));
322 if (m->control_pressed) { return 0; }
324 set<string> badNames;
325 if(processors == 1) { numFastaSeqs = driverPcr(fastafile, trimSeqFile, badSeqFile, badNames, lines[0]); }
326 else { numFastaSeqs = createProcesses(fastafile, trimSeqFile, badSeqFile, badNames); }
328 if (m->control_pressed) { return 0; }
330 //don't write or keep if blank
331 if (badNames.size() != 0) { writeAccnos(badNames); }
332 if (m->isBlank(badSeqFile)) { m->mothurRemove(badSeqFile); }
333 else { outputNames.push_back(badSeqFile); outputTypes["fasta"].push_back(badSeqFile); }
335 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
336 if (namefile != "") { readName(badNames); }
337 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
338 if (groupfile != "") { readGroup(badNames); }
339 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
340 if (taxfile != "") { readTax(badNames); }
341 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
343 m->mothurOutEndLine();
344 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
345 for (int i = 0; i < outputNames.size(); i++) { m->mothurOut(outputNames[i]); m->mothurOutEndLine(); }
346 m->mothurOutEndLine();
347 m->mothurOutEndLine();
349 //set fasta file as new current fastafile
351 itTypes = outputTypes.find("fasta");
352 if (itTypes != outputTypes.end()) {
353 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setFastaFile(current); }
356 itTypes = outputTypes.find("name");
357 if (itTypes != outputTypes.end()) {
358 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setNameFile(current); }
361 itTypes = outputTypes.find("group");
362 if (itTypes != outputTypes.end()) {
363 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setGroupFile(current); }
366 itTypes = outputTypes.find("accnos");
367 if (itTypes != outputTypes.end()) {
368 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setAccnosFile(current); }
371 itTypes = outputTypes.find("taxonomy");
372 if (itTypes != outputTypes.end()) {
373 if ((itTypes->second).size() != 0) { current = (itTypes->second)[0]; m->setTaxonomyFile(current); }
376 m->mothurOut("It took " + toString(time(NULL) - start) + " secs to screen " + toString(numFastaSeqs) + " sequences.");
377 m->mothurOutEndLine();
383 catch(exception& e) {
384 m->errorOut(e, "PcrSeqsCommand", "execute");
388 /**************************************************************************************************/
389 int PcrSeqsCommand::createProcesses(string filename, string goodFileName, string badFileName, set<string>& badSeqNames) {
392 vector<int> processIDS;
396 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
398 //loop through and create all the processes you want
399 while (process != processors) {
403 processIDS.push_back(pid); //create map from line number to pid so you can append files in correct order later
406 num = driverPcr(filename, goodFileName + toString(getpid()) + ".temp", badFileName + toString(getpid()) + ".temp", badSeqNames, lines[process]);
408 //pass numSeqs to parent
410 string tempFile = filename + toString(getpid()) + ".num.temp";
411 m->openOutputFile(tempFile, out);
412 out << num << '\t' << badSeqNames.size() << endl;
413 for (set<string>::iterator it = badSeqNames.begin(); it != badSeqNames.end(); it++) {
414 out << (*it) << endl;
420 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine();
421 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
426 num = driverPcr(filename, goodFileName, badFileName, badSeqNames, lines[0]);
428 //force parent to wait until all the processes are done
429 for (int i=0;i<processIDS.size();i++) {
430 int temp = processIDS[i];
434 for (int i = 0; i < processIDS.size(); i++) {
436 string tempFile = filename + toString(processIDS[i]) + ".num.temp";
437 m->openInputFile(tempFile, in);
438 int numBadNames = 0; string name = "";
439 if (!in.eof()) { int tempNum = 0; in >> tempNum >> numBadNames; num += tempNum; m->gobble(in); }
440 for (int j = 0; j < numBadNames; j++) {
441 in >> name; m->gobble(in);
442 badSeqNames.insert(name);
444 in.close(); m->mothurRemove(tempFile);
446 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
447 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
449 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
450 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
454 //////////////////////////////////////////////////////////////////////////////////////////////////////
455 //Windows version shared memory, so be careful when passing variables through the sumScreenData struct.
456 //Above fork() will clone, so memory is separate, but that's not the case with windows,
457 //Taking advantage of shared memory to allow both threads to add info to badSeqNames.
458 //////////////////////////////////////////////////////////////////////////////////////////////////////
460 vector<pcrData*> pDataArray;
461 DWORD dwThreadIdArray[processors-1];
462 HANDLE hThreadArray[processors-1];
464 //Create processor worker threads.
465 for( int i=0; i<processors-1; i++ ){
467 string extension = "";
468 if (i!=0) {extension += toString(i) + ".temp"; processIDS.push_back(i); }
470 // Allocate memory for thread data.
471 pcrData* tempPcr = new pcrData(filename, goodFileName+extension, badFileName+extension, m, oligosfile, ecolifile, primers, revPrimer, nomatch, keepprimer, keepdots, start, end, length, lines[i].start, lines[i].end);
472 pDataArray.push_back(tempPcr);
474 //default security attributes, thread function name, argument to thread function, use default creation flags, returns the thread identifier
475 hThreadArray[i] = CreateThread(NULL, 0, MyPcrThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);
479 num = driverPcr(filename, (goodFileName+toString(processors-1)+".temp"), (badFileName+toString(processors-1)+".temp"),badSeqNames, lines[processors-1]);
480 processIDS.push_back(processors-1);
482 //Wait until all threads have terminated.
483 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
485 //Close all thread handles and free memory allocations.
486 for(int i=0; i < pDataArray.size(); i++){
487 num += pDataArray[i]->count;
488 for (set<string>::iterator it = pDataArray[i]->badSeqNames.begin(); it != pDataArray[i]->badSeqNames.end(); it++) { badSeqNames.insert(*it); }
489 CloseHandle(hThreadArray[i]);
490 delete pDataArray[i];
493 for (int i = 0; i < processIDS.size(); i++) {
494 m->appendFiles((goodFileName + toString(processIDS[i]) + ".temp"), goodFileName);
495 m->mothurRemove((goodFileName + toString(processIDS[i]) + ".temp"));
497 m->appendFiles((badFileName + toString(processIDS[i]) + ".temp"), badFileName);
498 m->mothurRemove((badFileName + toString(processIDS[i]) + ".temp"));
506 catch(exception& e) {
507 m->errorOut(e, "PcrSeqsCommand", "createProcesses");
512 //**********************************************************************************************************************
513 int PcrSeqsCommand::driverPcr(string filename, string goodFasta, string badFasta, set<string>& badSeqNames, linePair filePos){
516 m->openOutputFile(goodFasta, goodFile);
519 m->openOutputFile(badFasta, badFile);
522 m->openInputFile(filename, inFASTA);
524 inFASTA.seekg(filePos.start);
532 if (m->control_pressed) { break; }
534 Sequence currSeq(inFASTA); m->gobble(inFASTA);
536 string trashCode = "";
537 if (currSeq.getName() != "") {
540 if (oligosfile != "") {
541 map<int, int> mapAligned;
542 bool aligned = isAligned(currSeq.getAligned(), mapAligned);
545 if (primers.size() != 0) {
546 int primerStart = 0; int primerEnd = 0;
547 bool good = findForward(currSeq, primerStart, primerEnd);
549 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "f"; }
554 if (keepdots) { currSeq.filterToPos(mapAligned[primerEnd]); }
555 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerEnd])); }
558 if (keepdots) { currSeq.filterToPos(mapAligned[primerStart]); }
559 else { currSeq.setAligned(currSeq.getAligned().substr(mapAligned[primerStart])); }
562 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(primerEnd)); }
563 else { currSeq.setAligned(currSeq.getUnaligned().substr(primerStart)); }
568 //process reverse primers
569 if (revPrimer.size() != 0) {
570 int primerStart = 0; int primerEnd = 0;
571 bool good = findReverse(currSeq, primerStart, primerEnd);
572 if(!good){ if (nomatch == "reject") { goodSeq = false; } trashCode += "r"; }
577 if (keepdots) { currSeq.filterFromPos(mapAligned[primerStart]); }
578 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerStart])); }
581 if (keepdots) { currSeq.filterFromPos(mapAligned[primerEnd]); }
582 else { currSeq.setAligned(currSeq.getAligned().substr(0, mapAligned[primerEnd])); }
586 if (!keepprimer) { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerStart)); }
587 else { currSeq.setAligned(currSeq.getUnaligned().substr(0, primerEnd)); }
591 }else if (ecolifile != "") {
592 //make sure the seqs are aligned
593 lengths.insert(currSeq.getAligned().length());
594 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
595 else if (currSeq.getAligned().length() != length) {
596 m->mothurOut("[ERROR]: seqs are not the same length as ecoli seq. When using ecoli option your sequences must be aligned and the same length as the ecoli sequence.\n"); m->control_pressed = true; break;
599 currSeq.filterToPos(start);
600 currSeq.filterFromPos(end);
602 string seqString = currSeq.getAligned().substr(0, end);
603 seqString = seqString.substr(start);
604 currSeq.setAligned(seqString);
607 }else{ //using start and end to trim
608 //make sure the seqs are aligned
609 lengths.insert(currSeq.getAligned().length());
610 if (lengths.size() > 1) { m->mothurOut("[ERROR]: seqs are not aligned. When using start and end your sequences must be aligned.\n"); m->control_pressed = true; break; }
613 if (end > currSeq.getAligned().length()) { m->mothurOut("[ERROR]: end is longer than your sequence length, aborting.\n"); m->control_pressed = true; break; }
615 if (keepdots) { currSeq.filterFromPos(end); }
617 string seqString = currSeq.getAligned().substr(0, end);
618 currSeq.setAligned(seqString);
623 if (keepdots) { currSeq.filterToPos(start); }
625 string seqString = currSeq.getAligned().substr(start);
626 currSeq.setAligned(seqString);
632 //trimming removed all bases
633 if (currSeq.getUnaligned() == "") { goodSeq = false; }
635 if(goodSeq == 1) { currSeq.printSequence(goodFile); }
637 badSeqNames.insert(currSeq.getName());
638 currSeq.setName(currSeq.getName() + '|' + trashCode);
639 currSeq.printSequence(badFile);
644 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
645 unsigned long long pos = inFASTA.tellg();
646 if ((pos == -1) || (pos >= filePos.end)) { break; }
648 if (inFASTA.eof()) { break; }
652 if((count) % 100 == 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
655 if((count) % 100 != 0){ m->mothurOut("Processing sequence: " + toString(count)); m->mothurOutEndLine(); }
663 catch(exception& e) {
664 m->errorOut(e, "PcrSeqsCommand", "driverPcr");
668 //********************************************************************/
669 bool PcrSeqsCommand::findForward(Sequence& seq, int& primerStart, int& primerEnd){
672 string rawSequence = seq.getUnaligned();
674 for(int j=0;j<primers.size();j++){
675 string oligo = primers[j];
677 if(rawSequence.length() < oligo.length()) { break; }
680 int olength = oligo.length();
681 for (int j = 0; j < rawSequence.length()-olength; j++){
682 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
683 string rawChunk = rawSequence.substr(j, olength);
684 if(compareDNASeq(oligo, rawChunk)) {
686 primerEnd = primerStart + olength;
693 primerStart = 0; primerEnd = 0;
697 catch(exception& e) {
698 m->errorOut(e, "TrimOligos", "stripForward");
702 //******************************************************************/
703 bool PcrSeqsCommand::findReverse(Sequence& seq, int& primerStart, int& primerEnd){
706 string rawSequence = seq.getUnaligned();
708 for(int i=0;i<revPrimer.size();i++){
709 string oligo = revPrimer[i];
710 if(rawSequence.length() < oligo.length()) { break; }
713 int olength = oligo.length();
714 for (int j = rawSequence.length()-olength; j >= 0; j--){
715 if (m->control_pressed) { primerStart = 0; primerEnd = 0; return false; }
716 string rawChunk = rawSequence.substr(j, olength);
718 if(compareDNASeq(oligo, rawChunk)) {
720 primerEnd = primerStart + olength;
727 primerStart = 0; primerEnd = 0;
730 catch(exception& e) {
731 m->errorOut(e, "PcrSeqsCommand", "findReverse");
735 //********************************************************************/
736 bool PcrSeqsCommand::isAligned(string seq, map<int, int>& aligned){
738 bool isAligned = false;
741 for (int i = 0; i < seq.length(); i++) {
742 if (!isalpha(seq[i])) { isAligned = true; }
743 else { aligned[countBases] = i; countBases++; } //maps location in unaligned -> location in aligned.
744 } //ie. the 3rd base may be at spot 10 in the alignment
745 //later when we trim we want to trim from spot 10.
748 catch(exception& e) {
749 m->errorOut(e, "PcrSeqsCommand", "isAligned");
753 //********************************************************************/
754 string PcrSeqsCommand::reverseOligo(string oligo){
758 for(int i=oligo.length()-1;i>=0;i--){
760 if(oligo[i] == 'A') { reverse += 'T'; }
761 else if(oligo[i] == 'T'){ reverse += 'A'; }
762 else if(oligo[i] == 'U'){ reverse += 'A'; }
764 else if(oligo[i] == 'G'){ reverse += 'C'; }
765 else if(oligo[i] == 'C'){ reverse += 'G'; }
767 else if(oligo[i] == 'R'){ reverse += 'Y'; }
768 else if(oligo[i] == 'Y'){ reverse += 'R'; }
770 else if(oligo[i] == 'M'){ reverse += 'K'; }
771 else if(oligo[i] == 'K'){ reverse += 'M'; }
773 else if(oligo[i] == 'W'){ reverse += 'W'; }
774 else if(oligo[i] == 'S'){ reverse += 'S'; }
776 else if(oligo[i] == 'B'){ reverse += 'V'; }
777 else if(oligo[i] == 'V'){ reverse += 'B'; }
779 else if(oligo[i] == 'D'){ reverse += 'H'; }
780 else if(oligo[i] == 'H'){ reverse += 'D'; }
782 else { reverse += 'N'; }
788 catch(exception& e) {
789 m->errorOut(e, "PcrSeqsCommand", "reverseOligo");
794 //***************************************************************************************************************
795 bool PcrSeqsCommand::readOligos(){
798 m->openInputFile(oligosfile, inOligos);
800 string type, oligo, group;
802 while(!inOligos.eof()){
806 if(type[0] == '#'){ //ignore
807 while (!inOligos.eof()) { char c = inOligos.get(); if (c == 10 || c == 13){ break; } } // get rest of line if there's any crap there
811 //make type case insensitive
812 for(int i=0;i<type.length();i++){ type[i] = toupper(type[i]); }
816 for(int i=0;i<oligo.length();i++){
817 oligo[i] = toupper(oligo[i]);
818 if(oligo[i] == 'U') { oligo[i] = 'T'; }
821 if(type == "FORWARD"){
822 // get rest of line in case there is a primer name
823 while (!inOligos.eof()) {
824 char c = inOligos.get();
825 if (c == 10 || c == 13){ break; }
826 else if (c == 32 || c == 9){;} //space or tab
828 primers.push_back(oligo);
829 }else if(type == "REVERSE"){
830 string oligoRC = reverseOligo(oligo);
831 revPrimer.push_back(oligoRC);
832 //cout << "oligo = " << oligo << " reverse = " << oligoRC << endl;
833 }else if(type == "BARCODE"){
835 }else if((type == "LINKER")||(type == "SPACER")) {;}
836 else{ m->mothurOut(type + " is not recognized as a valid type. Choices are forward, reverse, linker, spacer and barcode. Ignoring " + oligo + "."); m->mothurOutEndLine(); m->control_pressed = true; }
842 if ((primers.size() == 0) && (revPrimer.size() == 0)) {
843 m->mothurOut("[ERROR]: your oligos file does not contain valid primers or reverse primers. Please correct."); m->mothurOutEndLine();
844 m->control_pressed = true;
850 }catch(exception& e) {
851 m->errorOut(e, "PcrSeqsCommand", "readOligos");
855 //***************************************************************************************************************
856 bool PcrSeqsCommand::readEcoli(){
859 m->openInputFile(ecolifile, in);
864 length = ecoli.getAligned().length();
865 start = ecoli.getStartPos();
866 end = ecoli.getEndPos();
867 }else { in.close(); m->control_pressed = true; return false; }
872 catch(exception& e) {
873 m->errorOut(e, "PcrSeqsCommand", "readEcoli");
878 //***************************************************************************************************************
879 int PcrSeqsCommand::writeAccnos(set<string> badNames){
881 string thisOutputDir = outputDir;
882 if (outputDir == "") { thisOutputDir += m->hasPath(fastafile); }
883 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(fastafile)) + getOutputFileNameTag("accnos");
884 outputNames.push_back(outputFileName); outputTypes["accnos"].push_back(outputFileName);
887 m->openOutputFile(outputFileName, out);
889 for (set<string>::iterator it = badNames.begin(); it != badNames.end(); it++) {
890 if (m->control_pressed) { break; }
891 out << (*it) << endl;
897 catch(exception& e) {
898 m->errorOut(e, "PcrSeqsCommand", "writeAccnos");
903 //******************************************************************/
904 bool PcrSeqsCommand::compareDNASeq(string oligo, string seq){
907 int length = oligo.length();
909 for(int i=0;i<length;i++){
911 if(oligo[i] != seq[i]){
912 if(oligo[i] == 'A' || oligo[i] == 'T' || oligo[i] == 'G' || oligo[i] == 'C') { success = 0; }
913 else if((oligo[i] == 'N' || oligo[i] == 'I') && (seq[i] == 'N')) { success = 0; }
914 else if(oligo[i] == 'R' && (seq[i] != 'A' && seq[i] != 'G')) { success = 0; }
915 else if(oligo[i] == 'Y' && (seq[i] != 'C' && seq[i] != 'T')) { success = 0; }
916 else if(oligo[i] == 'M' && (seq[i] != 'C' && seq[i] != 'A')) { success = 0; }
917 else if(oligo[i] == 'K' && (seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
918 else if(oligo[i] == 'W' && (seq[i] != 'T' && seq[i] != 'A')) { success = 0; }
919 else if(oligo[i] == 'S' && (seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
920 else if(oligo[i] == 'B' && (seq[i] != 'C' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
921 else if(oligo[i] == 'D' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'G')) { success = 0; }
922 else if(oligo[i] == 'H' && (seq[i] != 'A' && seq[i] != 'T' && seq[i] != 'C')) { success = 0; }
923 else if(oligo[i] == 'V' && (seq[i] != 'A' && seq[i] != 'C' && seq[i] != 'G')) { success = 0; }
925 if(success == 0) { break; }
934 catch(exception& e) {
935 m->errorOut(e, "PcrSeqsCommand", "compareDNASeq");
940 //***************************************************************************************************************
941 int PcrSeqsCommand::readName(set<string>& names){
943 string thisOutputDir = outputDir;
944 if (outputDir == "") { thisOutputDir += m->hasPath(namefile); }
945 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(namefile)) + getOutputFileNameTag("name", namefile);
948 m->openOutputFile(outputFileName, out);
951 m->openInputFile(namefile, in);
952 string name, firstCol, secondCol;
954 bool wroteSomething = false;
955 int removedCount = 0;
958 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
960 in >> firstCol; m->gobble(in);
963 string savedSecond = secondCol;
964 vector<string> parsedNames;
965 m->splitAtComma(secondCol, parsedNames);
967 vector<string> validSecond; validSecond.clear();
968 for (int i = 0; i < parsedNames.size(); i++) {
969 if (names.count(parsedNames[i]) == 0) {
970 validSecond.push_back(parsedNames[i]);
974 if (validSecond.size() != parsedNames.size()) { //we want to get rid of someone, so get rid of everyone
975 for (int i = 0; i < parsedNames.size(); i++) { names.insert(parsedNames[i]); }
976 removedCount += parsedNames.size();
978 out << firstCol << '\t' << savedSecond << endl;
979 wroteSomething = true;
986 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
987 outputTypes["name"].push_back(outputFileName); outputNames.push_back(outputFileName);
989 m->mothurOut("Removed " + toString(removedCount) + " sequences from your name file."); m->mothurOutEndLine();
993 catch(exception& e) {
994 m->errorOut(e, "PcrSeqsCommand", "readName");
998 //**********************************************************************************************************************
999 int PcrSeqsCommand::readGroup(set<string> names){
1001 string thisOutputDir = outputDir;
1002 if (outputDir == "") { thisOutputDir += m->hasPath(groupfile); }
1003 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(groupfile)) + getOutputFileNameTag("group", groupfile);
1006 m->openOutputFile(outputFileName, out);
1009 m->openInputFile(groupfile, in);
1012 bool wroteSomething = false;
1013 int removedCount = 0;
1016 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1018 in >> name; //read from first column
1019 in >> group; //read from second column
1021 //if this name is in the accnos file
1022 if (names.count(name) == 0) {
1023 wroteSomething = true;
1024 out << name << '\t' << group << endl;
1025 }else { removedCount++; }
1032 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1033 outputTypes["group"].push_back(outputFileName); outputNames.push_back(outputFileName);
1035 m->mothurOut("Removed " + toString(removedCount) + " sequences from your group file."); m->mothurOutEndLine();
1040 catch(exception& e) {
1041 m->errorOut(e, "PcrSeqsCommand", "readGroup");
1045 //**********************************************************************************************************************
1046 int PcrSeqsCommand::readTax(set<string> names){
1048 string thisOutputDir = outputDir;
1049 if (outputDir == "") { thisOutputDir += m->hasPath(taxfile); }
1050 string outputFileName = thisOutputDir + m->getRootName(m->getSimpleName(taxfile)) + getOutputFileNameTag("taxonomy", taxfile);
1052 m->openOutputFile(outputFileName, out);
1055 m->openInputFile(taxfile, in);
1058 bool wroteSomething = false;
1059 int removedCount = 0;
1062 if (m->control_pressed) { in.close(); out.close(); m->mothurRemove(outputFileName); return 0; }
1064 in >> name; //read from first column
1065 in >> tax; //read from second column
1067 //if this name is in the accnos file
1068 if (names.count(name) == 0) {
1069 wroteSomething = true;
1070 out << name << '\t' << tax << endl;
1071 }else { removedCount++; }
1078 if (wroteSomething == false) { m->mothurOut("Your file contains only sequences from the .accnos file."); m->mothurOutEndLine(); }
1079 outputTypes["taxonomy"].push_back(outputFileName); outputNames.push_back(outputFileName);
1081 m->mothurOut("Removed " + toString(removedCount) + " sequences from your taxonomy file."); m->mothurOutEndLine();
1085 catch(exception& e) {
1086 m->errorOut(e, "PcrSeqsCommand", "readTax");
1090 /**************************************************************************************/